#Authors: Ian McElveen and Cecilia Gonzales
#Author Date: 7/14/2025
#Purpose: The purpose of this notebook is to house all data set transformation, cleansing, visualization, statistical analysis, and note-taking for the 2025 CU Athletic Department Sports Science Internship Program

#LAST UPDATED: 8/5/2025

#Including helpful libraries
library(tidyverse)
── Attaching core tidyverse packages ─────────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.2     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.4     
── Conflicts ───────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(readxl)
library(aod)
library(gt)
G2;H2;Warningh: package ‘gt’ was built under R version 4.5.1g
library(boot)
library(mgcv)
G3;Loading required package: nlme
gG3;
Attaching package: ‘nlme’

gG3;The following object is masked from ‘package:dplyr’:

    collapse

gG3;This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
gG3;
Attaching package: ‘mgcv’

gG3;The following object is masked from ‘package:aod’:

    negbin

g
library(lme4)
G3;Loading required package: Matrix
gG3;
Attaching package: ‘Matrix’

gG3;The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack

gG3;
Attaching package: ‘lme4’

gG3;The following object is masked from ‘package:nlme’:

    lmList

g
library(leaps)
G2;H2;Warningh: package ‘leaps’ was built under R version 4.5.1g

Data Cleaning

#loading in the Catapult data to look at sprinting values
Catapult_Session <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Catapult Session - Outdoor FB.csv")
New names:
• `` -> `...1`
Rows: 77140 Columns: 491
── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr   (33): anon_id, Date, Gender, Sport, Primary.Position, Day.of.the.Week, Month, Month.w...
dbl  (427): ...1, Age, Year, Period.Number, Total.Duration..min., Total.Distance, Velocity....
lgl   (27): Deceleration.B3.Total.Effort, Deceleration.B2.Total.Effort, Deceleration.B1.Tot...
time   (4): Start, End, Total.Duration, Field.Time

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#loading in the Historical Running data to look at running imbalance values
Historical_Running <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Compiled Historical Running Imbalance FB.csv")
Rows: 9942 Columns: 4
── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (4): anon_id, Name, Date, Running.Imbalance

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#loading in the Incident Report to look at HSIs
Incident_Report <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Incident Report FB IDs.csv")
New names:
• `` -> `...1`
G2;H2;Warningh: One or more parsing issues, call `problems()` on your data frame for details, e.g.:
  dat <- vroom(...)
  problems(dat)g
Rows: 2220 Columns: 225
── Column specification ───────────────────────────────────────────────────────────────────────
Delimiter: ","
chr  (163): anon_id, Date, Sport, Position, Medical.Alerts, View.Medical.Alerts, Incident.T...
dbl   (31): ...1, Days.to.Exam, Days.in.Status, Games.Missed, Injury.to.Exam.Duration, Inju...
lgl   (28): MRN, Converted.to.injury.from.maintenance.record, What.sport.is.this.incident.r...
time   (3): Time.of.Injury, Time.Entered.On, Time.Stamp

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Catapult_Session_clean <- Catapult_Session %>%
  #putting the date as a date class
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  #only selecting important columns for this analysis
  select(anon_id, Date, Age, Primary.Position, Total.Distance, Period.Name, Total.Duration..min., Velocity.Band.1.Total.Distance, Velocity.Band.2.Total.Distance, Velocity.Band.3.Total.Distance, Velocity.Band.4.Total.Distance, Velocity.Band.5.Total.Distance, Velocity.Band.6.Total.Distance, Velocity.Band.7.Total.Distance, Velocity.Band.8.Total.Distance, Velocity.Band.2.Total.Effort.Count, Velocity.Band.3.Total.Effort.Count, Velocity.Band.4.Total.Effort.Count, Velocity.Band.5.Total.Effort.Count, Velocity.Band.6.Total.Effort.Count, Velocity.Band.7.Total.Effort.Count, Velocity.Band.8.Total.Effort.Count, Maximum.Velocity, Average.Velocity, Hit.90.Percent.Max, Date.of.Last.90.Effort, Days.Since.Last.90.Effort, Hit.Max.Velocity., Date.of.All.Time.Max.Velocity, Days.Since.Max.Velocity, Session.Max.Velocity) %>%
  #calculating each player's maximum velocity
  group_by(anon_id) %>%
  mutate(Player.Max.Velocity = max(na.omit(Maximum.Velocity))) %>%
  ungroup() %>%
  #only selecting data from January 1, 2024 and on
  filter(Date >= "2024-01-01")

head(Catapult_Session_clean)
Historical_Running_clean <- Historical_Running %>%
  #taking out rows that don't have data
  filter(Running.Imbalance != "n/a") %>%
  #putting running imbalance as a number and converting the date to a date class
  mutate(Running.Imbalance = as.numeric(Running.Imbalance),
         Date = as.Date(Date, "%m/%d/%Y")) %>%
  #only using data from January 1, 2024 and on
  filter(Date >= "2024-01-01") %>%
  mutate(X=1:4063) %>%
  #making days since January 1, 2024 for each player
  group_by(anon_id) %>%
  mutate(Days.Since.Start = as.numeric(Date - min(Date))) %>%
  ungroup()

head(Historical_Running_clean)
Incident_Report_clean <- Incident_Report %>%
  #filtering for only hamstring injuries
  filter(OSICS14.Code == "TM1",
         Status != "Full Go")  %>%
  #getting the date of the injury as a date class
  mutate(Date = as.Date(Date, "%m/%d/%Y"),
         Date.of.Injury = as.Date(Date.of.Injury...Onset.of.symptoms, "%m/%d/%Y"),
         Examination.Date = as.Date(Examination.Date, "%m/%d/%Y")) %>%
  #only selecting relevant columns for this analysis
  select(anon_id, Position, Date, Date.of.Injury, Time.of.Injury, Side, OSICS.Injury.Diagnosis, Coach.s.Diagnosis, Recurrence.of.Injury, Choose.Season, Onset.of.Symptoms, Injury.Prognosis, General.Mechanism, Specific.Mechanism, Injured.While., Type.of.Event, Season., Status, Days.in.Status) %>%
  #making days out due to injury for each player and each injury they sustained
  group_by(anon_id, Date.of.Injury) %>%
  mutate(Days.Out = sum(Days.in.Status)) %>%
  ungroup()

head(Incident_Report_clean)
#taking the IDs of players who are and aren't injured
all_IDs <- unique(Historical_Running_clean$anon_id)
#taking IDs that were injured and also have running imbalance data
injured_IDs <- intersect(unique(Incident_Report_clean$anon_id), all_IDs)
#taking all players with running imbalance data that don't have an injury
uninjured_IDs <- unique((Historical_Running_clean %>%
  filter(!anon_id %in% injured_IDs))$anon_id)
#injured players who also have running imbalance data
Incident_Report_clean <- Incident_Report_clean %>%
  filter(anon_id %in% injured_IDs)

#all players that only have running imbalance data or have both running imbalance data and incidence report
Historical_Running_clean <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs | anon_id %in% uninjured_IDs)
#removing uncleaned data sets
remove(Incident_Report)
remove(Historical_Running)
remove(Catapult_Session)

Section 1: Running Speed

How often are athletes reaching ≥ 90% maximum velocity throughout a training season?

# Bar chart for how often players reach ≥ 90% maximum velocity

# Count for how many times each anon_id hit ≥ 90% maximum velocity
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% # Filter for training season
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, Primary.Position) %>%
  group_by(anon_id, Primary.Position) %>%
  summarise(times_hit_90 = n(), .groups = "drop") %>%
  mutate(
    Position_Group = case_when(
      Primary.Position %in% c("QB", "LB", "TE", "RB") ~ "COMBO",
      Primary.Position %in% c("OL", "DL", "DE") ~ "BIGS",
      Primary.Position %in% c("WR", "DB", "DB, WR") ~ "SKILL",
      TRUE ~ "OTHER"
    ))

# Plot of all players' frequencies
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_hline(yintercept = mean(hit_90_counts$times_hit_90), 
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  labs(title = "Player Counts for Achieving ≥ 90% of Maximum Velocity During 2024–25 Season", 
       subtitle = paste("Team Average:", round(mean(hit_90_counts$times_hit_90), 1)),
       x = "Athlete ID", y = "Times ≥ 90%") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 6, angle = 90))

Do not need because we created a function to plot each position

# Bar chart of times reached >90%, Quarterbacks

# Filter to only have data for QBs
QBs <- hit_90_counts %>%
  filter(Primary.Position == "QB")

# Calculate the averages first
overall_avg <- mean(hit_90_counts$times_hit_90)
qb_avg <- mean(QBs$times_hit_90)

ggplot(QBs, aes(x=anon_id, y=times_hit_90)) +
  geom_bar(stat="identity", fill = "#CFB87C") + 
  geom_text(aes(label = times_hit_90), 
            vjust = -0.5, 
            size = 3.5) +
  geom_hline(yintercept = overall_avg, 
             linetype = "dashed", color = "#000000") +
  geom_hline(yintercept = qb_avg, 
             linetype = "dashed", color = "#565A5C") +
  annotate("text", x = 1, y = overall_avg + 0.5, 
           label = "Team Avg", color = "#000000", size = 3) +
  annotate("text", x = 1, y = qb_avg + 0.5, 
           label = "QB Avg", color = "#565A5C", size = 3) +
  labs(title = "QB Counts for Reaching ≥90% Max Velocity", 
       x = "Athlete ID", y = "Times ≥90%",
       subtitle = paste("QB Average:", qb_avg)) +
  theme_classic()

Facet plot useful, but hard to read so probably don’t include in presentation.

# Facet Plot

# Calculate overall average
overall_avg <- mean(hit_90_counts$times_hit_90)

# Plot faceted bar charts by position
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
  geom_hline(yintercept = overall_avg, linetype = "dashed", color = "#000000") +
  annotate("text", x = 1, y = overall_avg + 0.5, 
           label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
  facet_wrap(~ Primary.Position, scales = "free_x") +
  labs(title = "Counts of ≥90% Max Velocity by Player and Position",
       y = "Times ≥90% Max Velocity",
       x = "Athlete ID") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plots for each position

# Get the overall team average once from hit_90_counts
team_avg <- mean(hit_90_counts$times_hit_90)

# Define  positions to loop through
positions <- unique(hit_90_counts$Primary.Position)

# Plotting function:
plot_hit_90_by_position <- function(pos) {
  position_data <- hit_90_counts %>%
    filter(Primary.Position == pos)
  
  pos_avg <- mean(position_data$times_hit_90)
  
  ggplot(position_data, aes(x = anon_id, y = times_hit_90)) +
    geom_bar(stat = "identity", fill = "#CFB87C") +
    geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
    geom_hline(yintercept = pos_avg, linetype = "dashed", color = "#565A5C") +
    annotate("text", x = 1, y = pos_avg + 0.5, 
             label = "Pos Avg", color = "#565A5C", size = 3, hjust = 0) +
    geom_hline(yintercept = team_avg, linetype = "dashed", color = "#000000") +
    annotate("text", x = 1, y = team_avg + 0.5, 
             label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
    labs(title = paste("Times ≥90% Max Velocity –", pos),
         subtitle = paste0("Position Avg: ", round(pos_avg, 1),
                           " | Team Avg: ", round(team_avg, 1)),
         y = "Count",
         x = "anon_id") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Loop through each position and print the plot
plots_by_position <- lapply(positions, function(pos) {
  print(plot_hit_90_by_position(pos))
})

# Table for average values of each position

position_averages <- hit_90_counts %>%
  group_by(Primary.Position) %>%
  summarise(avg_times_hit_90 = mean(times_hit_90), .groups = "drop") %>%
  arrange(desc(avg_times_hit_90))

position_averages_with_team <- bind_rows(
  tibble(
    Primary.Position = "Team Average",
    avg_times_hit_90 = team_avg
  ),
  position_averages
)

position_averages_with_team %>%
  gt() %>%
  tab_header(
    title = "Average Times ≥90% Max Velocity by Position and Team"
  )
Average Times ≥90% Max Velocity by Position and Team
Primary.Position avg_times_hit_90
Team Average 11.768421
DB 18.352941
WR 15.666667
DL 10.789474
RB 10.500000
QB 9.750000
LB 8.916667
TE 8.400000
OL 6.764706

DBs have the highest average times reaching ≥90% of Max velocity (18.4). The OL had the lowest (6.8). Team average was 11.8.

# Plot counts for each position group

# Get the overall team average once from hit_90_counts
team_avg <- mean(hit_90_counts$times_hit_90)

# Define  positions to loop through
positions_groups <- unique(hit_90_counts$Position_Group)

# Plotting function:
plot_hit_90_group <- function(pos) {
  group_data <- hit_90_counts %>%
    filter(Position_Group == pos)
  
  group_avg <- mean(group_data$times_hit_90)
  
  ggplot(group_data, aes(x = anon_id, y = times_hit_90)) +
    geom_bar(stat = "identity", fill = "#CFB87C") +
    geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
    geom_hline(yintercept = group_avg, linetype = "dashed", color = "#565A5C") +
    annotate("text", x = 1, y = group_avg + 0.5, 
             label = "Group Avg", color = "#565A5C", size = 3, hjust = 0) +
    geom_hline(yintercept = team_avg, linetype = "dashed", color = "#000000") +
    annotate("text", x = 1, y = team_avg + 0.5, 
             label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
    labs(title = paste("Times ≥90% Max Velocity –", pos),
         subtitle = paste0("Group Avg: ", round(group_avg, 1),
                           " | Team Avg: ", round(team_avg, 1)),
         y = "Count",
         x = "anon_id") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Loop through each position and print the plot
plots_by_group <- lapply(positions_groups, function(pos) {
  print(plot_hit_90_group(pos))
})

The skill position group has the highest average times reached ≥90% Max with 17.1, then combo with 9.3, then bigs with 8.9.

# Make facet plot so that it fits on the presentation slides better.

# Calculate averages for each group and the team
group_avgs <- hit_90_counts %>%
  group_by(Position_Group) %>%
  summarise(group_avg = mean(times_hit_90))

team_avg <- mean(hit_90_counts$times_hit_90)

# Join group averages back to the main data
plot_data <- hit_90_counts %>%
  left_join(group_avgs, by = "Position_Group")

# Create label data with formatted strings
avg_labels <- plot_data %>%
  group_by(Position_Group) %>%
  summarise(
    group_avg = unique(group_avg),
    team_avg = team_avg,
    label_x = 1,
    group_label_y = group_avg + 1,
    team_label_y = team_avg + 1,
    group_label = round(group_avg, 1),
    team_label = round(team_avg, 1),
    .groups = "drop"
  )

# Plot
ggplot(plot_data, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_hline(aes(yintercept = group_avg), linetype = "dashed", color = "#000000") +
  geom_hline(yintercept = team_avg, linetype = "dotted", color = "#565A5C") +
  geom_text(data = avg_labels, aes(x = label_x, y = group_label_y, label = group_label),
            inherit.aes = FALSE, hjust = 0, size = 3, color = "#000000") +
  geom_text(data = avg_labels, aes(x = label_x, y = team_label_y, label = team_label),
            inherit.aes = FALSE, hjust = 0, size = 3, color = "#565A5C") +
  facet_wrap(~ Position_Group, scales = "free_x") +
  labs(title = "Times Reached ≥90% Max Velocity by Position Group",
       y = "Count", x = "anon_id") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Create position group avg count table

# Create team average row
team_row <- tibble(
  Group = "Team Avg",
  Average = team_avg
)

# Create group average rows
group_rows <- hit_90_counts %>%
  group_by(Position_Group) %>%
  summarise(Average = mean(times_hit_90), .groups = "drop") %>%
  arrange(desc(Average)) %>%  # Sort from highest to lowest
  rename(Group = Position_Group)

# Combine them (team first)
combined_table <- bind_rows(team_row, group_rows)

# View the table
combined_table

No surprise that the skill position group has the highest average counts.

# >90% counts over the course of the season

# Trying to find out when are players reaching above 90% the most

# Create dataset that has the count of >90% of each day
daily_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  group_by(Date) %>%
  summarise(daily_hits = n())

# Plot
ggplot(daily_90_counts, aes(x = Date, y = daily_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_smooth(method = "loess", se = FALSE, color = "#565A5C", linetype = "dashed") +
  labs(title = "Daily Count of Players Reaching ≥ 90% of Max Velocity",
       subtitle = "Over the 2024–25 Training Season",
       x = "Date", y = "Times reached ≥ 90%") +
  theme_classic()

Data is noisy, so lets try grouping by week instead of every day.

# >90% counts for each week instead of each day. Will be a little less noisy than the daily
weekly_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  mutate(week = floor_date(Date, unit = "week")) %>%
  group_by(week) %>%
  summarise(weekly_hits = n())

mean_val <- round(mean(weekly_90_counts$weekly_hits), 1)

ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_point(color = "#000000") +
  geom_smooth(method = "loess", se = FALSE, color = "#565A5C", linetype = "dashed", linewidth = 0.6) +
  labs(title = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
       subtitle = paste0("Week Average: ", mean_val),
       x = "Week", y = "Times reached ≥ 90%") +
  theme_classic()

# Bar chart of weekly >90% counts

ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_col(fill = "#CFB87C", color = "#565A5C", width = 5) +
  geom_text(aes(label = weekly_hits), 
            vjust = -0.4, 
            size = 2.5, 
            color = "black") +
  geom_hline(yintercept = mean_val,
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  annotate("text", 
           x = min(weekly_90_counts$week) + 7,  # Adjust this date to place the label
           y = mean_val + 1.5,                 # Slightly above the line
           label = paste("Mean:", round(mean_val, 1)),
           size = 3.25, color = "#000000") +
  labs(
    title    = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
    subtitle = paste("Season Average per Week:", round(mean(weekly_90_counts$weekly_hits), 1)),
    x        = "Week",
    y        = "Times reached ≥ 90%"
  ) +
  scale_x_date(date_breaks = "2 weeks", date_labels = "%b %d") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Some of our lowest average counts for a week seem to be during the season, whereas some of our highest counts seem to be pre and post-season.

Should we consider the number of sprinting efforts that athletes are completing?

How often athletes are hitting which velocity bands in relation to their top speeds.

# Create sum of weekly band totals

# Make sure week is defined
Catapult_Session_clean <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% 
  mutate(week = floor_date(Date, unit = "week"))

# Sum efforts by week and athlete
weekly_velocity_efforts <- Catapult_Session_clean %>%
  group_by(anon_id, week) %>%
  summarise(
    V2 = sum(Velocity.Band.2.Total.Effort.Count, na.rm = TRUE),
    V3 = sum(Velocity.Band.3.Total.Effort.Count, na.rm = TRUE),
    V4 = sum(Velocity.Band.4.Total.Effort.Count, na.rm = TRUE),
    V5 = sum(Velocity.Band.5.Total.Effort.Count, na.rm = TRUE),
    V6 = sum(Velocity.Band.6.Total.Effort.Count, na.rm = TRUE),
    V7 = sum(Velocity.Band.7.Total.Effort.Count, na.rm = TRUE),
    V8 = sum(Velocity.Band.8.Total.Effort.Count, na.rm = TRUE),
    total_efforts = V2 + V3 + V4 + V5 + V6 + V7 + V8,
    Weekly_Max_Velocity = max(Maximum.Velocity, na.rm = TRUE),
    Player_Max_Velocity = first(Player.Max.Velocity),
    .groups = "drop"
  ) %>%
  mutate(
    pct_of_max_velocity = (Weekly_Max_Velocity / Player_Max_Velocity) * 100
  )

# Create percentage of weekly efforts in each band
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  mutate(
    pct_V2 = (V2 / total_efforts) * 100,
    pct_V3 = (V3 / total_efforts) * 100,
    pct_V4 = (V4 / total_efforts) * 100,
    pct_V5 = (V5 / total_efforts) * 100,
    pct_V6 = (V6 / total_efforts) * 100,
    pct_V7 = (V7 / total_efforts) * 100,
    pct_V8 = (V8 / total_efforts) * 100
  )

weekly_velocity_efforts <- weekly_velocity_efforts %>%
  filter(total_efforts > 0)
# Plot for each player

# Loop over all players and display their plots
unique(weekly_velocity_efforts$anon_id) %>%
  lapply(function(player_id) {
    
    # Prepare the data for one player
    player_weekly_data <- weekly_velocity_efforts %>%
      filter(anon_id == player_id) %>%
      select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
      pivot_longer(
        cols = starts_with("pct_V"),
        names_to = "velocity_band",
        values_to = "percent_effort"
      ) %>%
      mutate(
        velocity_band = factor(
          velocity_band,
          levels = paste0("pct_V", 8:2),
          labels = paste0("V", 8:2)
        )
      )
    
    # Skip empty data
    if (nrow(player_weekly_data) == 0) return(NULL)
    
    # Generate and print the plot
    plot <- ggplot(player_weekly_data, aes(x = week)) +
      geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2" = "darkgreen",
          "V3" = "green2",
          "V4" = "greenyellow",
          "V5" = "yellow",
          "V6" = "orange",
          "V7" = "tomato",
          "V8" = "firebrick"
  )
) +
    scale_y_continuous(
        name = "Band Distribution (%)",
        sec.axis = sec_axis(~ ., name = "Weekly Max Velocity (% of PR)")
      ) +
      labs(
        title = paste("Velocity Band Effort % and Speed Trend for Player", player_id),
        x = "Week",
        fill = "Velocity Band"
      ) +
      theme_classic()
    
    print(plot)  # Display plot
    
    return(NULL)
  })
G2;H2;Warningh: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_]8;;ide:run:warnings()warnings()]8;;` to see where this warning was generated.g
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
NULL

[[17]]
NULL

[[18]]
NULL

[[19]]
NULL

[[20]]
NULL

[[21]]
NULL

[[22]]
NULL

[[23]]
NULL

[[24]]
NULL

[[25]]
NULL

[[26]]
NULL

[[27]]
NULL

[[28]]
NULL

[[29]]
NULL

[[30]]
NULL

[[31]]
NULL

[[32]]
NULL

[[33]]
NULL

[[34]]
NULL

[[35]]
NULL

[[36]]
NULL

[[37]]
NULL

[[38]]
NULL

[[39]]
NULL

[[40]]
NULL

[[41]]
NULL

[[42]]
NULL

[[43]]
NULL

[[44]]
NULL

[[45]]
NULL

[[46]]
NULL

[[47]]
NULL

[[48]]
NULL

[[49]]
NULL

[[50]]
NULL

[[51]]
NULL

[[52]]
NULL

[[53]]
NULL

[[54]]
NULL

[[55]]
NULL

[[56]]
NULL

[[57]]
NULL

[[58]]
NULL

[[59]]
NULL

[[60]]
NULL

[[61]]
NULL

[[62]]
NULL

[[63]]
NULL

[[64]]
NULL

[[65]]
NULL

[[66]]
NULL

[[67]]
NULL

[[68]]
NULL

[[69]]
NULL

[[70]]
NULL

[[71]]
NULL

[[72]]
NULL

[[73]]
NULL

[[74]]
NULL

[[75]]
NULL

[[76]]
NULL

[[77]]
NULL

[[78]]
NULL

[[79]]
NULL

[[80]]
NULL

[[81]]
NULL

[[82]]
NULL

[[83]]
NULL

[[84]]
NULL

[[85]]
NULL

[[86]]
NULL

[[87]]
NULL

[[88]]
NULL

[[89]]
NULL

[[90]]
NULL

[[91]]
NULL

[[92]]
NULL

[[93]]
NULL

[[94]]
NULL

[[95]]
NULL

[[96]]
NULL

[[97]]
NULL

[[98]]
NULL

[[99]]
NULL

[[100]]
NULL

[[101]]
NULL

[[102]]
NULL

[[103]]
NULL

[[104]]
NULL

Each player is different in reaching different velocity bands in relation to their top speeds. Some players, ID_11 for example, rarely enter bands 6 or higher, even if they are close to 100% of their max velocity. This shows the issues with using absolute bands for the whole team.

# Filter for ID_11 and reshape the data
player_weekly_data <- weekly_velocity_efforts %>%
  filter(anon_id == "ID_11") %>%
  select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
  pivot_longer(
    cols = starts_with("pct_V"),
    names_to = "velocity_band",
    values_to = "percent_effort"
  ) %>%
  mutate(
    velocity_band = factor(
      velocity_band,
      levels = paste0("pct_V", 8:2),
      labels = paste0("V", 8:2)
    )
  )

# Plot for ID_11
ggplot(player_weekly_data, aes(x = week)) +
  geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
  geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
  scale_fill_manual(
    values = c(
      "V2" = "darkgreen",
      "V3" = "green2",
      "V4" = "greenyellow",
      "V5" = "yellow",
      "V6" = "orange",
      "V7" = "tomato",
      "V8" = "firebrick"
    )
  ) +
  scale_y_continuous(
    name = "Band Distribution (%)",
    sec.axis = sec_axis(~ ., name = "Weekly Max Velocity (% of PR)")
  ) +
  labs(
    title = "Velocity Band Effort % and Speed Trend for Player ID_11",
    x = "Week",
    fill = "Velocity Band"
  ) +
  theme_classic()

# Pick one player
player_id <- "ID_93"

# Filter data for that player and weeks
player_data <- weekly_velocity_efforts %>%
  filter(anon_id == player_id)

# Plot of Total Sprinting Efforts per Week
ggplot(player_data, aes(x = week, y = total_efforts)) +
  geom_line(color = "#CFB87C", size = 1) +
  geom_point(color = "#CFB87C", size = 2) +
  labs(
    title = paste("Total Sprinting Efforts per Week for Player", player_id),
    x = "Week",
    y = "Total Sprinting Efforts"
  ) +
  theme_classic()


#Plot of Sprint Effort Distribution by Velocity Band

# Reshape absolute counts to long format
player_counts_long <- player_data %>%
  select(week, V2, V3, V4, V5, V6, V7, V8) %>%
  pivot_longer(
    cols = V2:V8,
    names_to = "velocity_band",
    values_to = "effort_count"
  ) %>%
  mutate(
    velocity_band = factor(velocity_band, levels = paste0("V", 8:2))
  )

ggplot(player_counts_long, aes(x = week, y = effort_count, fill = velocity_band)) +
  geom_col(position = "stack") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = paste("Sprint Effort Distribution by Velocity Band for Player", player_id),
    x = "Week",
    y = "Sprint Effort Count",
    fill = "Velocity Band"
  ) +
  theme_classic()

Exploring correlations between bands and max speeds



# Select relevant columns
cor_data <- weekly_velocity_efforts %>%
  select(pct_of_max_velocity, pct_V2, pct_V3, pct_V4, pct_V5, pct_V6, pct_V7, pct_V8)

# Compute correlation matrix
cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")

cor_matrix
                    pct_of_max_velocity     pct_V2      pct_V3     pct_V4     pct_V5
pct_of_max_velocity          1.00000000 -0.5213861  0.05869457  0.4422664  0.4482732
pct_V2                      -0.52138614  1.0000000 -0.15363378 -0.8795243 -0.8605602
pct_V3                       0.05869457 -0.1536338  1.00000000  0.3272849 -0.2133725
pct_V4                       0.44226643 -0.8795243  0.32728485  1.0000000  0.6831749
pct_V5                       0.44827321 -0.8605602 -0.21337249  0.6831749  1.0000000
pct_V6                       0.41150125 -0.7765560 -0.37704110  0.4754871  0.8333338
pct_V7                       0.33727593 -0.6129867 -0.43386873  0.2712374  0.5853417
pct_V8                       0.33778410 -0.5101834 -0.42332967  0.1965351  0.4713043
                        pct_V6     pct_V7     pct_V8
pct_of_max_velocity  0.4115013  0.3372759  0.3377841
pct_V2              -0.7765560 -0.6129867 -0.5101834
pct_V3              -0.3770411 -0.4338687 -0.4233297
pct_V4               0.4754871  0.2712374  0.1965351
pct_V5               0.8333338  0.5853417  0.4713043
pct_V6               1.0000000  0.8067562  0.6559091
pct_V7               0.8067562  1.0000000  0.8731903
pct_V8               0.6559091  0.8731903  1.0000000
# Compute correlations with pct_of_max_velocity
cor_values <- cor_data %>%
  summarise(across(-pct_of_max_velocity,
                   ~ cor(.x, cor_data$pct_of_max_velocity, use = "pairwise.complete.obs"))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "correlation")

# Plot with labels
ggplot(cor_values, aes(x = reorder(variable, correlation), y = correlation)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(correlation, 2)), 
            hjust = ifelse(cor_values$correlation >= 0, -0.1, 1.1), 
            color = "black", size = 3) +
  coord_flip() +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  labs(
    title = "Correlation Between Percent in Band and Percent of Max Velocity",
    x = "Band",
    y = "Pearson Correlation"
  ) +
  theme_classic() 

V4 and V5 appear to be slightly more predictive of top-speed achievement than V6, V7 or V8 — possibly because they’re more frequently reached zones.

V2 has a moderately strong negative linear relationship with reaching top-speed. When athletes spend more effort in this low-speed band, their weekly top speed (as % of max) tends to be lower.

# Linear model with all bands predicting pct_of_max_velocity
model_all_bands <- lm(
  pct_of_max_velocity ~ pct_V2 + pct_V3 + pct_V4 + pct_V5 + pct_V6 + pct_V7 + pct_V8,
  data = weekly_velocity_efforts
)
summary(model_all_bands)

Call:
lm(formula = pct_of_max_velocity ~ pct_V2 + pct_V3 + pct_V4 + 
    pct_V5 + pct_V6 + pct_V7 + pct_V8, data = weekly_velocity_efforts)

Residuals:
    Min      1Q  Median      3Q     Max 
-60.296  -4.649   1.038   5.861  24.418 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 296.9092    32.0021   9.278  < 2e-16 ***
pct_V2       -2.3814     0.3220  -7.396 2.11e-13 ***
pct_V3       -1.8554     0.3234  -5.737 1.12e-08 ***
pct_V4       -1.9669     0.3371  -5.834 6.35e-09 ***
pct_V5       -1.8813     0.3535  -5.322 1.15e-07 ***
pct_V6       -1.7832     0.3502  -5.092 3.90e-07 ***
pct_V7       -3.1034     0.5924  -5.238 1.80e-07 ***
pct_V8            NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9.707 on 1872 degrees of freedom
Multiple R-squared:  0.2859,    Adjusted R-squared:  0.2836 
F-statistic: 124.9 on 6 and 1872 DF,  p-value: < 2.2e-16
# Compute change in top speeds week-to-week
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  arrange(anon_id, week) %>%
  group_by(anon_id) %>%
  mutate(
    pct_of_max_velocity_change = pct_of_max_velocity - lag(pct_of_max_velocity)
  ) %>%
  ungroup()

weekly_velocity_efforts <- weekly_velocity_efforts %>%
  group_by(anon_id) %>%
  mutate(
    lag_V2 = lag(V2),
    lag_V3 = lag(V3),
    lag_V4 = lag(V4),
    lag_V5 = lag(V5),
    lag_V6 = lag(V6),
    lag_V7 = lag(V7),
    lag_V8 = lag(V8),
    
    lag_pct_V2 = lag(pct_V2),
    lag_pct_V3 = lag(pct_V3),
    lag_pct_V4 = lag(pct_V4),
    lag_pct_V5 = lag(pct_V5),
    lag_pct_V6 = lag(pct_V6),
    lag_pct_V7 = lag(pct_V7),
    lag_pct_V8 = lag(pct_V8)
  ) %>%
  ungroup()
# Maybe group by position

# Get Player positions
player_positions <- Catapult_Session_clean %>%
  select(anon_id, Primary.Position) %>%
  distinct()

# Join Position into weekly_velocity_efforts
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  left_join(player_positions, by = "anon_id")
G2;H2;Warningh in left_join(., player_positions, by = "anon_id") :
  Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 310 of `x` matches multiple rows in `y`.
ℹ Row 78 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence
  this warning.g
# Position groups
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  mutate(
    Position_Group = case_when(
      Primary.Position %in% c("QB", "LB", "TE", "RB") ~ "COMBO",
      Primary.Position %in% c("OL", "DL", "DE") ~ "BIGS",
      Primary.Position %in% c("WR", "DB", "DB, WR") ~ "SKILL",
      TRUE ~ "OTHER"
    )
  ) %>%
  filter(Position_Group != "OTHER")
G1;Error in `mutate()`:
ℹ In argument: `Position_Group = case_when(...)`.
Caused by error in `case_when()`:
! Failed to evaluate the left-hand side of formula 1.
Caused by error:
! object 'Primary.Position' not found
Run `]8;;x-r-run:rlang::last_trace()rlang::last_trace()]8;;` to see where the error occurred.
g
weekly_band_effort_by_group <- weekly_velocity_efforts %>%
  group_by(week, Position_Group) %>%
  summarise(
    mean_V2 = mean(V2, na.rm = TRUE),
    mean_V3 = mean(V3, na.rm = TRUE),
    mean_V4 = mean(V4, na.rm = TRUE),
    mean_V5 = mean(V5, na.rm = TRUE),
    mean_V6 = mean(V6, na.rm = TRUE),
    mean_V7 = mean(V7, na.rm = TRUE),
    mean_V8 = mean(V8, na.rm = TRUE),
    
    mean_lag_V2 = mean(lag_V2, na.rm = TRUE),
    mean_lag_V3 = mean(lag_V3, na.rm = TRUE),
    mean_lag_V4 = mean(lag_V4, na.rm = TRUE),
    mean_lag_V5 = mean(lag_V5, na.rm = TRUE),
    mean_lag_V6 = mean(lag_V6, na.rm = TRUE),
    mean_lag_V7 = mean(lag_V7, na.rm = TRUE),
    mean_lag_V8 = mean(lag_V8, na.rm = TRUE),
    
    mean_pct_V2 = mean(pct_V2, na.rm = TRUE),
    mean_pct_V3 = mean(pct_V3, na.rm = TRUE),
    mean_pct_V4 = mean(pct_V4, na.rm = TRUE),
    mean_pct_V5 = mean(pct_V5, na.rm = TRUE),
    mean_pct_V6 = mean(pct_V6, na.rm = TRUE),
    mean_pct_V7 = mean(pct_V7, na.rm = TRUE),
    mean_pct_V8 = mean(pct_V8, na.rm = TRUE),
    mean_weekly_max_velocity = mean(Weekly_Max_Velocity, na.rm = TRUE),
    mean_pct_max_velocity = mean(pct_of_max_velocity, na.rm = TRUE),
    mean_pct_max_velocity_change = mean(pct_of_max_velocity_change, na.rm = TRUE),
    n_players = n()
  ) %>%
  ungroup()
`summarise()` has grouped output by 'week'. You can override using the `.groups` argument.

# Pivot longer to get bands in one column for easier plotting
band_long <- weekly_band_effort_by_group %>%
  pivot_longer(
    cols = starts_with("mean_pct_V"),
    names_to = "velocity_band",
    values_to = "mean_pct_effort"
  ) %>%
  mutate(
    velocity_band = factor(velocity_band, levels = paste0("mean_pct_V", 2:8))
  )

ggplot(band_long, aes(x = week, y = mean_pct_effort, color = velocity_band)) +
  geom_line(size = 1) +
  facet_wrap(~ Position_Group) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group",
    x = "Week",
    y = "Mean % Effort",
    color = "Velocity Band"
  ) +
   scale_color_manual(
        values = c(
          "mean_pct_V2" = "darkgreen",
          "mean_pct_V3" = "green2",
          "mean_pct_V4" = "greenyellow",
          "mean_pct_V5" = "yellow",
          "mean_pct_V6" = "orange",
          "mean_pct_V7" = "tomato",
          "mean_pct_V8" = "firebrick"
  )
) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

This shows the mean percent of total efforts in each band over weeks by each position group.

# Plot mean weekly max velocity
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_weekly_max_velocity, color = Position_Group)) +
  geom_line(size = 1.2) +
  labs(
    title = "Mean Weekly Max Velocity by Position Group",
    x = "Week",
    y = "Mean Weekly Max Velocity"
  ) +
  theme_classic()



# Plot mean % max velocity change
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_pct_max_velocity_change, color = Position_Group)) +
  geom_line(size = 1.2) +
  labs(
    title = "Mean % Change in Max Velocity by Position Group",
    x = "Week",
    y = "Mean % Change in Max Velocity"
  ) +
  theme_classic()

ggplot() +
  # Plot the stacked lines for band effort %
  geom_line(data = band_long, aes(x = week, y = mean_pct_effort, color = velocity_band), size = 1) +
  geom_line(data = weekly_band_effort_by_group,
            aes(x = week, y = mean_weekly_max_velocity),
            color = "black", size = 1.2) +

  facet_wrap(~ Position_Group) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group\nwith Mean Weekly Max Velocity",
    x = "Week",
    y = "Mean % Effort",
    color = "Velocity Band"
  ) +
  scale_color_manual(
        values = c(
          "mean_pct_V2" = "darkgreen",
          "mean_pct_V3" = "green2",
          "mean_pct_V4" = "greenyellow",
          "mean_pct_V5" = "yellow",
          "mean_pct_V6" = "orange",
          "mean_pct_V7" = "tomato",
          "mean_pct_V8" = "firebrick"
  )
) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Define max effort and max velocity range
max_effort <- 67
max_velocity <- max(weekly_band_effort_by_group$mean_weekly_max_velocity, na.rm = TRUE)

ggplot() +
  geom_line(data = band_long, aes(x = week, y = mean_pct_effort, color = velocity_band), size = 1) +
  
  # Scale max_velocity to % effort scale to overlay on left axis
  geom_line(data = weekly_band_effort_by_group, 
            aes(x = week, y = (mean_weekly_max_velocity / max_velocity) * max_effort, color = "Max Velocity"),
            size = 1.1) +
  
  facet_wrap(~ Position_Group) +
  scale_y_continuous(
    name = "Mean % Effort",
    sec.axis = sec_axis(~ . * max_velocity / max_effort, name = "Mean Weekly Max Velocity (m/s)")
  ) +
  scale_color_manual(
    values = c(
      "mean_pct_V2" = "darkgreen",
      "mean_pct_V3" = "green2",
      "mean_pct_V4" = "greenyellow",
      "mean_pct_V5" = "yellow",
      "mean_pct_V6" = "orange",
      "mean_pct_V7" = "tomato",
      "mean_pct_V8" = "firebrick",
      "Max Velocity" = "black"
    )
  ) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group\nwith Mean Weekly Max Velocity",
    x = "Week",
    color = "Velocity Band"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Modeling

# Separate by position group
bigs <- weekly_band_effort_by_group %>%
  filter(Position_Group == "BIGS")

skill <- weekly_band_effort_by_group %>%
  filter(Position_Group == "SKILL")

combo <- weekly_band_effort_by_group %>%
  filter(Position_Group == "COMBO")

# Bigs Model
model_bigs <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = bigs
  )
summary(model_bigs)

Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + 
    mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, 
    data = bigs)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.82832 -0.35903 -0.01466  0.49791  2.67987 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 15.0559323  0.4742880  31.744   <2e-16 ***
mean_lag_V2  0.0007666  0.0162677   0.047    0.963    
mean_lag_V3 -0.0174581  0.0608015  -0.287    0.776    
mean_lag_V4  0.0124617  0.0702459   0.177    0.861    
mean_lag_V5  0.0980525  0.0990405   0.990    0.331    
mean_lag_V6 -0.2662617  0.2211796  -1.204    0.239    
mean_lag_V7  0.5251046  0.5567021   0.943    0.354    
mean_lag_V8  0.6964538  0.5708904   1.220    0.233    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9501 on 27 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.4169,    Adjusted R-squared:  0.2658 
F-statistic: 2.758 on 7 and 27 DF,  p-value: 0.02682
# Skill Model
model_skill <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = skill
  )
summary(model_skill)

Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + 
    mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, 
    data = skill)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3762 -0.4584  0.0883  0.6361  1.9607 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 16.875636   0.803735  20.997   <2e-16 ***
mean_lag_V2 -0.010484   0.020345  -0.515    0.610    
mean_lag_V3 -0.007903   0.053066  -0.149    0.883    
mean_lag_V4  0.015572   0.061910   0.252    0.803    
mean_lag_V5  0.036273   0.102055   0.355    0.725    
mean_lag_V6  0.066696   0.126898   0.526    0.603    
mean_lag_V7 -0.010326   0.149988  -0.069    0.946    
mean_lag_V8  0.077498   0.198001   0.391    0.698    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.295 on 29 degrees of freedom
Multiple R-squared:  0.2599,    Adjusted R-squared:  0.08124 
F-statistic: 1.455 on 7 and 29 DF,  p-value: 0.2224
# Combo Model
model_combo <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = combo)
summary(model_combo)

Call:
lm(formula = mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + 
    mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, 
    data = combo)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.8950 -0.3423  0.1298  0.4643  1.3618 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 17.85151    0.42494  42.009   <2e-16 ***
mean_lag_V2  0.01297    0.01314   0.987    0.332    
mean_lag_V3 -0.04224    0.03598  -1.174    0.251    
mean_lag_V4  0.01894    0.04929   0.384    0.704    
mean_lag_V5  0.06684    0.08694   0.769    0.449    
mean_lag_V6 -0.12756    0.16048  -0.795    0.434    
mean_lag_V7  0.17000    0.27613   0.616    0.543    
mean_lag_V8 -0.04655    0.26492  -0.176    0.862    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.8189 on 27 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.2282,    Adjusted R-squared:  0.02812 
F-statistic: 1.141 on 7 and 27 DF,  p-value: 0.3681
# Group correlation:

# Select relevant variables
vars <- c("mean_weekly_max_velocity", "mean_lag_V2", "mean_lag_V3", 
          "mean_lag_V4", "mean_lag_V5", "mean_lag_V6", "mean_lag_V7", "mean_lag_V8")

# Correlation matrices
cor_bigs <- cor(bigs[, vars], use = "complete.obs")
cor_skill <- cor(skill[, vars], use = "complete.obs")
cor_combo <- cor(combo[, vars], use = "complete.obs")
round(cor_bigs, 2)
                         mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity                     1.00       -0.43       -0.38       -0.21
mean_lag_V2                                 -0.43        1.00        0.98        0.86
mean_lag_V3                                 -0.38        0.98        1.00        0.93
mean_lag_V4                                 -0.21        0.86        0.93        1.00
mean_lag_V5                                  0.16        0.28        0.36        0.60
mean_lag_V6                                  0.20        0.06        0.11        0.28
mean_lag_V7                                  0.35       -0.09       -0.03        0.12
mean_lag_V8                                  0.35        0.07        0.11        0.19
                         mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity        0.16        0.20        0.35        0.35
mean_lag_V2                     0.28        0.06       -0.09        0.07
mean_lag_V3                     0.36        0.11       -0.03        0.11
mean_lag_V4                     0.60        0.28        0.12        0.19
mean_lag_V5                     1.00        0.83        0.61        0.27
mean_lag_V6                     0.83        1.00        0.88        0.39
mean_lag_V7                     0.61        0.88        1.00        0.62
mean_lag_V8                     0.27        0.39        0.62        1.00
round(cor_skill, 2)
                         mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity                     1.00        0.19        0.21        0.22
mean_lag_V2                                  0.19        1.00        1.00        0.99
mean_lag_V3                                  0.21        1.00        1.00        1.00
mean_lag_V4                                  0.22        0.99        1.00        1.00
mean_lag_V5                                  0.27        0.98        0.98        0.99
mean_lag_V6                                  0.35        0.92        0.93        0.94
mean_lag_V7                                  0.37        0.61        0.62        0.62
mean_lag_V8                                  0.40        0.55        0.56        0.56
                         mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity        0.27        0.35        0.37        0.40
mean_lag_V2                     0.98        0.92        0.61        0.55
mean_lag_V3                     0.98        0.93        0.62        0.56
mean_lag_V4                     0.99        0.94        0.62        0.56
mean_lag_V5                     1.00        0.96        0.65        0.60
mean_lag_V6                     0.96        1.00        0.78        0.75
mean_lag_V7                     0.65        0.78        1.00        0.92
mean_lag_V8                     0.60        0.75        0.92        1.00
round(cor_combo, 2)
                         mean_weekly_max_velocity mean_lag_V2 mean_lag_V3 mean_lag_V4
mean_weekly_max_velocity                     1.00       -0.34       -0.33       -0.28
mean_lag_V2                                 -0.34        1.00        0.99        0.97
mean_lag_V3                                 -0.33        0.99        1.00        0.99
mean_lag_V4                                 -0.28        0.97        0.99        1.00
mean_lag_V5                                 -0.16        0.81        0.85        0.92
mean_lag_V6                                 -0.09        0.56        0.59        0.64
mean_lag_V7                                  0.04        0.17        0.19        0.23
mean_lag_V8                                  0.05        0.19        0.22        0.25
                         mean_lag_V5 mean_lag_V6 mean_lag_V7 mean_lag_V8
mean_weekly_max_velocity       -0.16       -0.09        0.04        0.05
mean_lag_V2                     0.81        0.56        0.17        0.19
mean_lag_V3                     0.85        0.59        0.19        0.22
mean_lag_V4                     0.92        0.64        0.23        0.25
mean_lag_V5                     1.00        0.83        0.46        0.43
mean_lag_V6                     0.83        1.00        0.84        0.71
mean_lag_V7                     0.46        0.84        1.00        0.89
mean_lag_V8                     0.43        0.71        0.89        1.00
# Reduce for better modeling
bigs <- bigs %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band = (mean_lag_V4 + mean_lag_V5 + mean_lag_V6) / 3,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
model_bigs_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = bigs)
summary(model_bigs_simple)

Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band + 
    high_band, data = bigs)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.62517 -0.48854 -0.01442  0.30496  2.80648 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 15.132025   0.468747  32.282   <2e-16 ***
low_band    -0.007602   0.003136  -2.424   0.0214 *  
mid_band     0.035535   0.034062   1.043   0.3049    
high_band    0.462722   0.320459   1.444   0.1588    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.9492 on 31 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.3317,    Adjusted R-squared:  0.267 
F-statistic: 5.129 on 3 and 31 DF,  p-value: 0.005368
# Skill Group
skill <- skill %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3 + mean_lag_V4) / 3,
    mid_band = (mean_lag_V5 + mean_lag_V6) / 2,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
model_skill_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = skill)
summary(model_skill_simple)

Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band + 
    high_band, data = skill)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3878 -0.3887  0.0538  0.6168  1.9999 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 16.833855   0.671421  25.072   <2e-16 ***
low_band    -0.019113   0.009036  -2.115   0.0420 *  
mid_band     0.124022   0.058472   2.121   0.0415 *  
high_band    0.051080   0.082001   0.623   0.5376    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.218 on 33 degrees of freedom
Multiple R-squared:  0.2539,    Adjusted R-squared:  0.1861 
F-statistic: 3.743 on 3 and 33 DF,  p-value: 0.02029
# Combo group
combo <- combo %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band = (mean_lag_V4 + mean_lag_V5) / 2,
    high_band = (mean_lag_V6 + mean_lag_V7 + mean_lag_V8) / 3
  )
model_combo_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = combo)
summary(model_combo_simple)

Call:
lm(formula = mean_weekly_max_velocity ~ low_band + mid_band + 
    high_band, data = combo)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.86355 -0.34167  0.06995  0.46065  1.47691 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.794e+01  3.691e-01  48.606   <2e-16 ***
low_band    -7.850e-03  4.452e-03  -1.763   0.0877 .  
mid_band     2.258e-02  1.956e-02   1.154   0.2573    
high_band    2.952e-05  5.154e-02   0.001   0.9995    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.7945 on 31 degrees of freedom
  (1 observation deleted due to missingness)
Multiple R-squared:  0.1658,    Adjusted R-squared:  0.0851 
F-statistic: 2.054 on 3 and 31 DF,  p-value: 0.1267

Bigs model: Statistically significant overall and explains around 33% of the variation of max velocity. The low band is significantly negative. Every unit increase in the low band results in a decrease in max velocity. The mid and high bands are not significant but the high band shows a positive trend. Based on this model, for bigs, more low-band effort may reduce weekly top speed, possibly indicating underexposure of high speeds. High-band efforts might help, but aren’t clearly impactful in this group.

Skill model: Almost significant. Low band has a negative effect, mid band has a positive effect, and high band is not significant. Based on this model, for skill players, more low-intensity exposure seems detrimental to max speed, while moderate band (V5–V6) exposure may enhance it.

Combo model: Model is not significant. The combo position group contains many different type of positions (QBs, RBs, TEs, LBs) with different movement and velocities. This variability may obscure relationships.

Across groups, low-band exposure is consistently negatively related to peak weekly speed — suggesting that too much low-intensity work may reduce the capacity to reach high speeds.

Are relative efforts and bands more advantageous than the absolute bands provided?

Look at ID_11’s plot from the last question. This player is an offensive lineman. Looking at the plot, he is mostly in band 2 and 3 while never reaching bands 7 or 8 and rarely reaching band 6. Despite this, this athlete’s weekly max velocity is always at least 75% of their all-time max velocity. Only looking at the absolute bands that are provided, we might come to the conclusion that ID_11 is not achieving high running speeds, however after looking at his max velocity efforts relative to his all-time max velocity, he is reaching high running speeds.

Create Relative Bands for ID_11

# Create Relative Bands for ID_11

# Prepare ID_11 dataset with relative bands
ID_11 <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% 
  filter(anon_id == "ID_11", Maximum.Velocity != 0) %>%
  filter(!is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
    # Create Relative Bands
    relative_band = cut(
      pct_of_max,
      breaks = c(20, 40, 50, 60, 70, 80, 90, 100),
      labels = c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
      right = TRUE, # Set to TRUE to include 100% in band
      include.lowest = TRUE  # Includes 0 in first band
    ),
    # Reverse factor levels so highest band is on top in plot
  relative_band = factor(
    relative_band,
    levels = rev(c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
  )
  )

# Summarize counts per week and relative band
weekly_effort <- ID_11 %>%
  filter(!is.na(relative_band)) %>%
  group_by(week, relative_band) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(week) %>%
  mutate(pct_effort = (count / sum(count)) * 100) %>%
  ungroup()

# Pivot wider to get percentages per band as columns
weekly_effort_wide <- weekly_effort %>%
  select(week, relative_band, pct_effort) %>%
  pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)

# Calculate weekly max velocity % of player max velocity
weekly_max_velocity <- ID_11 %>%
  group_by(week) %>%
  summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")

# Join the max velocity % back to wide effort data
weekly_effort_wide <- weekly_effort_wide %>%
  left_join(weekly_max_velocity, by = "week")

# Pivot longer for stacked bar plotting
weekly_effort_long <- weekly_effort_wide %>%
  pivot_longer(
    cols = c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
    names_to = "relative_band",
    values_to = "pct_effort"
  ) %>%
  mutate(
    relative_band = factor(
      relative_band,
      levels = rev(c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
    )
  )
# Plot Relative Bands for ID_11
ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
  geom_col(position = "stack") +
  geom_line(aes(y = pct_of_max_velocity, group = 1), 
            color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), 
             color = "black", size = 2) +
  scale_fill_manual(
    values = c(
      "V2 (20-40%)" = "darkgreen",
      "V3 (40-50%)" = "green2",
      "V4 (50-60%)" = "greenyellow",
      "V5 (60-70%)" = "yellow",
      "V6 (70-80%)" = "orange",
      "V7 (80-90%)" = "tomato",
      "V8 (90-100%)" = "darkred"
    )
  ) +
  labs(
    title = "Relative Velocity Band Effort % per Week, ID_11",
    x = "Week",
    y = "Percent of Weekly Efforts",
    fill = "Relative Velocity Band"
  ) + 
  scale_y_continuous(
    sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
  ) +
  theme_classic()

Comparing this with the absolute bands graph of ID_11, we can see that this is a much better representation of their running speeds.

Relative Bands for each athlete

# Function to create relative bands and plot these for each athlete

# Arrange data so that the plot will display in anon_id order
clean_anons <- Catapult_Session_clean %>%
  arrange(anon_id)

# Loop over all athletes and create their relative bands plots
unique(clean_anons$anon_id) %>%
  lapply(function(player_id) {
    # Prepare data for player
    player_data <- Catapult_Session_clean %>%
      filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
      filter(anon_id == player_id, Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
      mutate(
        week = floor_date(Date, unit = "week"),
        pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
        # Create Relative Bands
        relative_band = cut(
          pct_of_max,
          breaks = c(20, 40, 50, 60, 70, 80, 90, 100),
          labels = c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                     "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
          right = TRUE, # Set to true to include 100%
          include.lowest = TRUE  # Set to true to include 0 in 1st interval
        ),
        # Reverse factor levels so highest band is on top in plot
        relative_band = factor(
          relative_band,
          levels = rev(c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                         "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
        )
      )
    
    if (nrow(player_data) == 0) return(NULL)  # skip empty

    # Summarize counts per week and relative band
    weekly_effort <- player_data %>%
      filter(!is.na(relative_band)) %>%
      group_by(week, relative_band) %>%
      summarise(count = n(), .groups = "drop") %>%
      group_by(week) %>%
      mutate(pct_effort = (count / sum(count)) * 100) %>%
      ungroup()

    # Pivot wider to get percentages per band as columns
    weekly_effort_wide <- weekly_effort %>%
      select(week, relative_band, pct_effort) %>%
      pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)

    # Calculate weekly max velocity % of player max velocity
    weekly_max_velocity <- player_data %>%
      group_by(week) %>%
      summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")

    # Join
    weekly_effort_wide <- weekly_effort_wide %>%
      left_join(weekly_max_velocity, by = "week")
    
    # Pivot longer for stacked bar plotting
    weekly_effort_long <- weekly_effort_wide %>%
      pivot_longer(
          cols = -c(week, pct_of_max_velocity),
        names_to = "relative_band",
        values_to = "pct_effort"
      ) %>%
      mutate(
        relative_band = factor(
          relative_band,
          levels = rev(c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                         "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
        )
      )
    
    # Plot
    plot <- ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
      geom_col(position = "stack") +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2 (20-40%)" = "darkgreen",
          "V3 (40-50%)" = "green2",
          "V4 (50-60%)" = "greenyellow",
          "V5 (60-70%)" = "yellow",
          "V6 (70-80%)" = "orange",
          "V7 (80-90%)" = "tomato",
          "V8 (90-100%)" = "darkred"
        )
      ) +
      scale_y_continuous(
        name = "Percent of Weekly Effort",
        sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
      ) +
      labs(
        title = paste("Relative Velocity Band Effort % and Max Velocity Trend for Player", player_id),
        x = "Week",
        fill = "Relative Velocity Band"
      ) +
      theme_classic()

    print(plot)
    
    return(NULL)
  })
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

[[16]]
NULL

[[17]]
NULL

[[18]]
NULL

[[19]]
NULL

[[20]]
NULL

[[21]]
NULL

[[22]]
NULL

[[23]]
NULL

[[24]]
NULL

[[25]]
NULL

[[26]]
NULL

[[27]]
NULL

[[28]]
NULL

[[29]]
NULL

[[30]]
NULL

[[31]]
NULL

[[32]]
NULL

[[33]]
NULL

[[34]]
NULL

[[35]]
NULL

[[36]]
NULL

[[37]]
NULL

[[38]]
NULL

[[39]]
NULL

[[40]]
NULL

[[41]]
NULL

[[42]]
NULL

[[43]]
NULL

[[44]]
NULL

[[45]]
NULL

[[46]]
NULL

[[47]]
NULL

[[48]]
NULL

[[49]]
NULL

[[50]]
NULL

[[51]]
NULL

[[52]]
NULL

[[53]]
NULL

[[54]]
NULL

[[55]]
NULL

[[56]]
NULL

[[57]]
NULL

[[58]]
NULL

[[59]]
NULL

[[60]]
NULL

[[61]]
NULL

[[62]]
NULL

[[63]]
NULL

[[64]]
NULL

[[65]]
NULL

[[66]]
NULL

[[67]]
NULL

[[68]]
NULL

[[69]]
NULL

[[70]]
NULL

[[71]]
NULL

[[72]]
NULL

[[73]]
NULL

[[74]]
NULL

[[75]]
NULL

[[76]]
NULL

[[77]]
NULL

[[78]]
NULL

[[79]]
NULL

[[80]]
NULL

[[81]]
NULL

[[82]]
NULL

[[83]]
NULL

[[84]]
NULL

[[85]]
NULL

[[86]]
NULL

[[87]]
NULL

[[88]]
NULL

[[89]]
NULL

[[90]]
NULL

[[91]]
NULL

[[92]]
NULL

[[93]]
NULL

[[94]]
NULL

[[95]]
NULL

[[96]]
NULL

[[97]]
NULL

[[98]]
NULL

[[99]]
NULL

[[100]]
NULL

[[101]]
NULL

[[102]]
NULL

[[103]]
NULL

[[104]]
NULL

# Create relative bands for all athletes

# Define the velocity band labels
band_levels <- c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                     "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)")

# Create exposure data frame for all players
weekly_sprint_exposure <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
    relative_band = cut(
      pct_of_max,
      breaks = c(20, 40, 50, 60, 70, 80, 90, 100.1),
      labels = band_levels,
      right = TRUE,  # will include 100
      include.lowest = TRUE
    ),
    relative_band = factor(relative_band, levels = band_levels)
  ) %>%
  filter(!is.na(relative_band)) %>%
  group_by(anon_id, week, relative_band) %>%
  summarise(effort_count = n(), .groups = "drop") %>%
  group_by(anon_id, week) %>%
  mutate(
    total_efforts = sum(effort_count),
    pct_effort = (effort_count / total_efforts) * 100
  ) %>%
  ungroup() %>%
  select(anon_id, week, relative_band, pct_effort) %>%
  pivot_wider(
    names_from = relative_band,
    values_from = pct_effort,
    values_fill = 0
  )

# Add weekly max velocity % of max
weekly_max_pct <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100
  ) %>%
  group_by(anon_id, week) %>%
  summarise(pct_of_max_velocity = max(pct_of_max, na.rm = TRUE), .groups = "drop")

# Final data set
relative_bands <- weekly_sprint_exposure %>%
  left_join(weekly_max_pct, by = c("anon_id", "week"))

# Convert to long for plotting
relative_bands_long <- relative_bands %>%
  pivot_longer(
    cols = starts_with("V"),
    names_to = "relative_band",
    values_to = "pct_effort"
  ) %>%
  mutate(
    relative_band = factor(relative_band, levels = band_levels)
  )
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Positions <- c("COMBO", "BIG", "SKILL")

weekly_sprint_exposure <- left_join(weekly_sprint_exposure, player_positions, by = "anon_id", relationship = "many-to-many")

weekly_sprint_exposure <- weekly_sprint_exposure %>%
  mutate(Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
                               Primary.Position %in% BIG ~ "BIG",
                               Primary.Position %in% SKILL ~ "SKILL"))

weekly_sprint_exposure <- distinct(weekly_sprint_exposure)

max_velocities <- weekly_velocity_efforts[,c("anon_id","week","pct_of_max_velocity")]

weekly_sprint_exposure <- left_join(weekly_sprint_exposure, max_velocities, by = c("anon_id", "week"), relationship="many-to-many")




sprint_exposure_big <- weekly_sprint_exposure %>%
  filter(Position == "BIG") %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()

sprint_exposure_combo <- weekly_sprint_exposure %>%
  filter(Position == "COMBO")  %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()

sprint_exposure_skill <- weekly_sprint_exposure %>%
  filter(Position == "SKILL")  %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()
  

When I built the models that considered the relative bands instead of the absolute bands provided, I found that initially, the results were a lot better than the previous models with the absolute bands. This suggests that the relative bounds which are segmented into 10% chunks of all time maximum velocity for each player is a lot more indicative of effort and therefore their percentage of maximum velocity in a given week. We are able to see though that there is a lot of multicollinearity within the relative bands calculated.

# Modeling

# Bigs Model
model_big <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
                data=sprint_exposure_big)
summary(model_big)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + 
    avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_big)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.3536 -2.0453 -0.4238  3.0271  7.3825 

Coefficients: (1 not defined because of singularities)
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 94.419490   2.672085  35.336  < 2e-16 ***
avg_V2       0.109035   0.042088   2.591  0.00979 ** 
avg_V3      -0.184292   0.031633  -5.826 8.80e-09 ***
avg_V4      -0.525727   0.065265  -8.055 3.59e-15 ***
avg_V5      -0.007007   0.026275  -0.267  0.78981    
avg_V6      -0.156410   0.026333  -5.940 4.57e-09 ***
avg_V7      -0.101020   0.051665  -1.955  0.05096 .  
avg_V8             NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.395 on 675 degrees of freedom
Multiple R-squared:  0.5617,    Adjusted R-squared:  0.5579 
F-statistic: 144.2 on 6 and 675 DF,  p-value: < 2.2e-16
# Skill Model
model_skill <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
                data=sprint_exposure_skill)
summary(model_skill)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + 
    avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_skill)

Residuals:
     Min       1Q   Median       3Q      Max 
-21.5539  -1.5243   0.8314   2.2824   4.5808 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 97.88743    1.00261  97.632  < 2e-16 ***
avg_V2      -0.31316    0.02476 -12.648  < 2e-16 ***
avg_V3       0.11433    0.04046   2.826  0.00485 ** 
avg_V4      -0.20598    0.02375  -8.672  < 2e-16 ***
avg_V5       0.07949    0.02860   2.779  0.00560 ** 
avg_V6      -0.17947    0.02233  -8.038 4.01e-15 ***
avg_V7      -0.12491    0.01580  -7.905 1.08e-14 ***
avg_V8            NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.474 on 684 degrees of freedom
Multiple R-squared:  0.295, Adjusted R-squared:  0.2888 
F-statistic:  47.7 on 6 and 684 DF,  p-value: < 2.2e-16
# Combo Model
model_combo <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
               data=sprint_exposure_combo)
summary(model_combo)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V3 + avg_V4 + avg_V5 + 
    avg_V6 + avg_V7 + avg_V8, data = sprint_exposure_combo)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.8241 -2.4566 -0.1927  2.0818  5.2847 

Coefficients: (1 not defined because of singularities)
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 106.96461    1.82898  58.483  < 2e-16 ***
avg_V2       -0.45039    0.03330 -13.526  < 2e-16 ***
avg_V3       -0.16937    0.04751  -3.565   0.0004 ***
avg_V4       -0.23402    0.03326  -7.036 6.71e-12 ***
avg_V5        0.05042    0.03436   1.467   0.1429    
avg_V6       -0.42216    0.02510 -16.818  < 2e-16 ***
avg_V7       -0.23244    0.02979  -7.802 3.72e-14 ***
avg_V8             NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.967 on 488 degrees of freedom
Multiple R-squared:  0.4771,    Adjusted R-squared:  0.4706 
F-statistic:  74.2 on 6 and 488 DF,  p-value: < 2.2e-16

Looking at the correlations between all of the predictors as well as the response variable in the model we can see that a lot of the predictors have super strong correlations which each other, some more so than they are with the response. This suggests that the issue with the models above is that multicollinearity is bogging them down and we are not seeing the true relationships between the predictors and the response.

cor(sprint_exposure_big[,13:20])
            avg_pct_max     avg_V8     avg_V7     avg_V6      avg_V5     avg_V4      avg_V3
avg_pct_max   1.0000000  0.4865222  0.6445407  0.3192494 -0.31031882 -0.6997373 -0.54003451
avg_V8        0.4865222  1.0000000  0.6952135  0.1432250 -0.48309810 -0.5621637 -0.77382752
avg_V7        0.6445407  0.6952135  1.0000000  0.5888598 -0.32557565 -0.9396666 -0.84733973
avg_V6        0.3192494  0.1432250  0.5888598  1.0000000  0.15143484 -0.6757628 -0.66059705
avg_V5       -0.3103188 -0.4830981 -0.3255756  0.1514348  1.00000000  0.3001174  0.06603157
avg_V4       -0.6997373 -0.5621637 -0.9396666 -0.6757628  0.30011740  1.0000000  0.78000601
avg_V3       -0.5400345 -0.7738275 -0.8473397 -0.6605971  0.06603157  0.7800060  1.00000000
avg_V2       -0.3574179 -0.6620986 -0.7855184 -0.6700407 -0.08266465  0.6539145  0.89174861
                 avg_V2
avg_pct_max -0.35741790
avg_V8      -0.66209858
avg_V7      -0.78551843
avg_V6      -0.67004071
avg_V5      -0.08266465
avg_V4       0.65391446
avg_V3       0.89174861
avg_V2       1.00000000
cor(sprint_exposure_skill[,13:20])
            avg_pct_max     avg_V8     avg_V7      avg_V6     avg_V5     avg_V4     avg_V3
avg_pct_max  1.00000000  0.3391430 -0.0781862  0.03905024  0.1254586 -0.2050365 -0.1191009
avg_V8       0.33914302  1.0000000  0.2663767 -0.24015312 -0.4838171 -0.6142101 -0.6069156
avg_V7      -0.07818620  0.2663767  1.0000000 -0.18801908 -0.7375266 -0.5562658 -0.6046100
avg_V6       0.03905024 -0.2401531 -0.1880191  1.00000000  0.4531227 -0.1796023 -0.2246013
avg_V5       0.12545861 -0.4838171 -0.7375266  0.45312268  1.0000000  0.3666557  0.3744037
avg_V4      -0.20503653 -0.6142101 -0.5562658 -0.17960230  0.3666557  1.0000000  0.6875933
avg_V3      -0.11910094 -0.6069156 -0.6046100 -0.22460135  0.3744037  0.6875933  1.0000000
avg_V2      -0.30920757 -0.4993300 -0.5159128 -0.31075378  0.2152760  0.4795015  0.6905731
                avg_V2
avg_pct_max -0.3092076
avg_V8      -0.4993300
avg_V7      -0.5159128
avg_V6      -0.3107538
avg_V5       0.2152760
avg_V4       0.4795015
avg_V3       0.6905731
avg_V2       1.0000000
cor(sprint_exposure_combo[,13:20])
            avg_pct_max      avg_V8     avg_V7      avg_V6      avg_V5     avg_V4     avg_V3
avg_pct_max   1.0000000  0.55044191  0.2831128 -0.11308822 -0.12549739 -0.3390832 -0.2777397
avg_V8        0.5504419  1.00000000  0.6982032  0.04492787 -0.66014337 -0.7494741 -0.6822165
avg_V7        0.2831128  0.69820315  1.0000000  0.27601693 -0.69690768 -0.8447070 -0.8649314
avg_V6       -0.1130882  0.04492787  0.2760169  1.00000000  0.08101263 -0.4838699 -0.5455592
avg_V5       -0.1254974 -0.66014337 -0.6969077  0.08101263  1.00000000  0.4734404  0.4645439
avg_V4       -0.3390832 -0.74947414 -0.8447070 -0.48386990  0.47344043  1.0000000  0.8406075
avg_V3       -0.2777397 -0.68221648 -0.8649314 -0.54555921  0.46454387  0.8406075  1.0000000
avg_V2       -0.3915264 -0.68029067 -0.7763047 -0.60519674  0.36233033  0.7936349  0.8353624
                avg_V2
avg_pct_max -0.3915264
avg_V8      -0.6802907
avg_V7      -0.7763047
avg_V6      -0.6051967
avg_V5       0.3623303
avg_V4       0.7936349
avg_V3       0.8353624
avg_V2       1.0000000

All of the models below are the best 3 predictor models that resulted for each position after running the best subsets algorithm. All of the following have reductions in adjusted-\(R^2\) but they don’t seem significant considering that we took out over half of the predictors and maintained a relatively similar adjusted-\(R^2\) value.

#Bigs

#best_sub_big <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_big, method="exhaustive")
#summary(best_sub_big)

model_big <- lm(avg_pct_max~avg_V2+avg_V6+avg_V7,
                data=sprint_exposure_big)
summary(model_big)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V6 + avg_V7, data = sprint_exposure_big)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.6953 -2.8762 -0.6379  2.3819  8.4553 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 72.52999    0.68993 105.126  < 2e-16 ***
avg_V2       0.19119    0.02311   8.272 6.96e-16 ***
avg_V6       0.01949    0.01961   0.994     0.32    
avg_V7       0.33232    0.01600  20.775  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.71 on 678 degrees of freedom
Multiple R-squared:  0.4741,    Adjusted R-squared:  0.4718 
F-statistic: 203.7 on 3 and 678 DF,  p-value: < 2.2e-16
#Skills

#best_sub_skill <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_skill, method="exhaustive")
#summary(best_sub_skill)

model_skill <- lm(avg_pct_max~avg_V2+avg_V5+avg_V6,
                data=sprint_exposure_skill)
summary(model_skill)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V5 + avg_V6, data = sprint_exposure_skill)

Residuals:
    Min      1Q  Median      3Q     Max 
-34.619  -1.565   0.184   2.381   5.526 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 90.35401    0.51443 175.641  < 2e-16 ***
avg_V2      -0.23848    0.02062 -11.565  < 2e-16 ***
avg_V5       0.17951    0.02236   8.029 4.28e-15 ***
avg_V6      -0.13557    0.02285  -5.934 4.70e-09 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.746 on 687 degrees of freedom
Multiple R-squared:  0.1765,    Adjusted R-squared:  0.1729 
F-statistic: 49.07 on 3 and 687 DF,  p-value: < 2.2e-16
#Combos
#best_sub_combo <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_combo, method="exhaustive")
#summary(best_sub_combo)

model_combo <- lm(avg_pct_max~avg_V2+avg_V4+avg_V5,
               data=sprint_exposure_combo)
summary(model_combo)

Call:
lm(formula = avg_pct_max ~ avg_V2 + avg_V4 + avg_V5, data = sprint_exposure_combo)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.2418 -2.4661  0.6291  2.5272  6.9210 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 86.98522    0.50775 171.315  < 2e-16 ***
avg_V2      -0.15548    0.03217  -4.833  1.8e-06 ***
avg_V4      -0.04478    0.03349  -1.337    0.182    
avg_V5       0.02630    0.03136   0.839    0.402    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.757 on 491 degrees of freedom
Multiple R-squared:  0.1567,    Adjusted R-squared:  0.1515 
F-statistic: 30.41 on 3 and 491 DF,  p-value: < 2.2e-16

Since the models perform roughly the same with just 3 predictors as they do with all 7, it may make sense to instead truncate into more general bins. This may help us understand the relationship with low, medium and high effort with weekly maximum velocity.

sprint_exposure_big <- sprint_exposure_big %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)

sprint_exposure_skill <- sprint_exposure_skill %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)

sprint_exposure_combo <- sprint_exposure_combo %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)
#bigs
model_big_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_big)
summary(model_big_general)

Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_big)

Residuals:
   Min     1Q Median     3Q    Max 
-8.632 -1.791 -0.790  2.369  8.653 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 91.70273    0.58548  156.63   <2e-16 ***
avg_low     -0.21289    0.01412  -15.08   <2e-16 ***
avg_medium  -0.50797    0.03495  -14.54   <2e-16 ***
avg_high          NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.959 on 679 degrees of freedom
Multiple R-squared:  0.4004,    Adjusted R-squared:  0.3987 
F-statistic: 226.7 on 2 and 679 DF,  p-value: < 2.2e-16
#skills
model_skill_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_skill)
summary(model_skill_general)

Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_skill)

Residuals:
    Min      1Q  Median      3Q     Max 
-34.673  -1.718   0.784   1.955   7.801 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 88.90869    0.49464 179.743  < 2e-16 ***
avg_low     -0.17498    0.02524  -6.933 9.48e-12 ***
avg_medium   0.04499    0.02907   1.547    0.122    
avg_high          NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.987 on 688 degrees of freedom
Multiple R-squared:  0.06574,   Adjusted R-squared:  0.06302 
F-statistic:  24.2 on 2 and 688 DF,  p-value: 6.944e-11
#combos
model_combo_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_combo)
summary(model_combo_general)

Call:
lm(formula = avg_pct_max ~ avg_low + avg_medium + avg_high, data = sprint_exposure_combo)

Residuals:
    Min      1Q  Median      3Q     Max 
-9.1658 -2.0159  0.3012  2.3565  7.2075 

Coefficients: (1 not defined because of singularities)
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 90.40529    0.66256 136.449  < 2e-16 ***
avg_low     -0.13407    0.02322  -5.773 1.38e-08 ***
avg_medium  -0.24183    0.04095  -5.906 6.56e-09 ***
avg_high          NA         NA      NA       NA    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.694 on 492 degrees of freedom
Multiple R-squared:  0.1828,    Adjusted R-squared:  0.1795 
F-statistic: 55.03 on 2 and 492 DF,  p-value: < 2.2e-16

Still pretty bad, I am going to use principle components regression to see if there are other relationships we are missing

preds <- sprint_exposure_big[,14:20]

pc_loadings <- prcomp(preds, scale=TRUE)$rotation[,c(1,2,3)]
pc_loadings
              PC1         PC2        PC3
avg_V8  0.3589784 -0.39846525 -0.5094340
avg_V7  0.4469184 -0.08953137  0.1575945
avg_V6  0.3272167  0.45491322  0.4580859
avg_V5 -0.1106036  0.75421813 -0.3762755
avg_V4 -0.4230166  0.03282438 -0.4224436
avg_V3 -0.4468487 -0.08929811  0.2488676
avg_V2 -0.4165066 -0.21992488  0.3518826

How does sprinting exposure (# of efforts, % max reached) relate to incidence of hamstring injuries?

# Join relative_bands data set with injury data

Incident_dates <- Incident_Report_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"))

# Get ids of athletes with injury
injured_ids <- unique(Incident_dates$anon_id)

# Filter relative_bands to only include athletes who got injured
injured_data <- relative_bands_long %>%
  filter(anon_id %in% injured_ids)

# Create injury weeks
injury_weeks <- Incident_Report_clean %>%
  mutate(
    week = floor_date(as.Date(Date.of.Injury), unit = "week")
  ) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)

injuries_and_running <- injured_data %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = ifelse(is.na(injury), 0, injury))

injuries_and_running <- injuries_and_running %>%
  mutate(relative_band = factor(relative_band, levels = band_levels))


# Filter to only include players 194 and 285
plot_data <- injuries_and_running %>%
  filter(anon_id %in% c("ID_194", "ID_285"))

# Identify injury weeks for shading
injury_shading <- plot_data %>%
  filter(injury == 1) %>%
  mutate(
    xmin = week - 3,
    xmax = week + 3,
    ymin = -Inf,
    ymax = Inf
  )

# Create a single faceted plot
ggplot(plot_data, aes(x = week)) +
  geom_rect(data = injury_shading,
            aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
            fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
  geom_col(aes(y = pct_effort, fill = relative_band), position = position_stack(reverse = TRUE)) +
  geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
  facet_wrap(~ anon_id, scales = "free_x") +
  scale_fill_manual(
    values = c(
      "V2 (20-40%)" = "darkgreen",
      "V3 (40-50%)" = "green2",
      "V4 (50-60%)" = "greenyellow",
      "V5 (60-70%)" = "yellow",
      "V6 (70-80%)" = "orange",
      "V7 (80-90%)" = "tomato",
      "V8 (90-100%)" = "darkred"
    )
  ) +
  scale_y_continuous(
    name = "Percent of Weekly Effort",
    sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
  ) +
  labs(
    title = "Velocity Band Exposure & Max % for Injured Players 194 and 285",
    x = "Week",
    fill = "Relative Velocity Band"
  ) +
  theme_classic()

# Loop over injured athletes and create their plots

unique(injuries_and_running$anon_id) %>%
  lapply(function(player_id) {
    
    # Filter data for this player
    plot_data <- injuries_and_running %>%
      filter(anon_id == player_id)
    
    if (nrow(plot_data) == 0) return(NULL)  # Skip if no data
    
    # Get injury weeks
    injury_week_nums <- plot_data %>%
      filter(injury == 1) %>%
      pull(week)
    
    # Plot
    plot <- ggplot(plot_data, aes(x = week)) +
      # Highlight injury weeks with shaded blue areas
      geom_rect(data = data.frame(week = injury_week_nums),
                aes(xmin = week - 3, xmax = week + 3, ymin = -Inf, ymax = Inf),
                fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
      geom_col(aes(y = pct_effort, fill = relative_band), position = position_stack(reverse = TRUE)) +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2 (20-40%)" = "darkgreen",
          "V3 (40-50%)" = "green2",
          "V4 (50-60%)" = "greenyellow",
          "V5 (60-70%)" = "yellow",
          "V6 (70-80%)" = "orange",
          "V7 (80-90%)" = "tomato",
          "V8 (90-100%)" = "darkred"
        )
      ) +
      scale_y_continuous(
        name = "Percent of Weekly Effort",
        sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
      ) +
      labs(
        title = paste("Velocity Band Exposure & Max % for Injured Player", player_id),
        x = "Week",
        fill = "Relative Velocity Band"
      ) +
      theme_classic()
    
    print(plot)
    
    return(NULL)
  })
[[1]]
NULL

[[2]]
NULL

[[3]]
NULL

[[4]]
NULL

[[5]]
NULL

[[6]]
NULL

[[7]]
NULL

[[8]]
NULL

[[9]]
NULL

[[10]]
NULL

[[11]]
NULL

[[12]]
NULL

[[13]]
NULL

[[14]]
NULL

[[15]]
NULL

The blue lines indicate a week where an injury occurred. No real insights can be made from looking at relative velocity bands and injury occurrances.

model_injury <- glm(injury ~ pct_of_max_velocity,
                    data = injuries_and_running,
                    family = "binomial")

summary(model_injury)

Call:
glm(formula = injury ~ pct_of_max_velocity, family = "binomial", 
    data = injuries_and_running)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)         -3.71775    0.90526  -4.107 4.01e-05 ***
pct_of_max_velocity  0.00923    0.01041   0.887    0.375    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 939.85  on 2330  degrees of freedom
Residual deviance: 939.03  on 2329  degrees of freedom
AIC: 943.03

Number of Fisher Scoring iterations: 5

Simple model Each 1 percentage point increase in pct_of_max_velocity increases the log-odds of injury by 0.00923. Not significant.

# Get summary stats for both injury and non injury

# Clean and get date format
injuries_and_running_clean <- injuries_and_running %>%
  mutate(week_formatted = format(as.Date(week), "%m-%d-%Y")) %>%
  distinct(anon_id, week, .keep_all = TRUE)

# Filter for only injury weeks
injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 1) %>%
  mutate(injury_event = paste0(anon_id, "__", week_formatted))

# Calculate statistics for injury weeks
injury_mean <- mean(injury_weeks$pct_of_max_velocity, na.rm = TRUE)

injury_summary_stats <- injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )
# Confidence interval 
se <- injury_summary_stats$sd_pct / sqrt(injury_summary_stats$n)
t_crit <- qt(0.975, df = injury_summary_stats$n - 1)
lower <- injury_summary_stats$mean_pct - t_crit * se
upper <- injury_summary_stats$mean_pct + t_crit * se
# Print
cat("95% CI for mean pct_of_max_velocity:", round(lower,2), "-", round(upper,2))
95% CI for mean pct_of_max_velocity: 82.52 - 90.59
# Filter for non-injury weeks
non_injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 0)

# Get statistics for non-injury weeks
non_injury_mean <- mean(non_injury_weeks$pct_of_max_velocity, na.rm = TRUE)

non_injury_summary_stats <- non_injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )

# Confidence interval
non_injury_se <- non_injury_summary_stats$sd_pct / sqrt(non_injury_summary_stats$n)
non_injury_t_crit <- qt(0.975, df = non_injury_summary_stats$n - 1)
non_injury_lower <- non_injury_summary_stats$mean_pct - non_injury_t_crit * non_injury_se
non_injury_upper <- non_injury_summary_stats$mean_pct + non_injury_t_crit * non_injury_se

# Print results
cat("95% CI for mean pct_of_max_velocity (non-injury weeks):", 
    round(non_injury_lower, 2), "-", round(non_injury_upper, 2))
95% CI for mean pct_of_max_velocity (non-injury weeks): 84.65 - 86.83
# T-test to compare pct_of_max_velocity in injuries and non injuries that week
t_test <- t.test(
  pct_of_max_velocity ~ injury, 
  data = injuries_and_running_clean,
  var.equal = FALSE # use TRUE if you assume equal variances
)

# Print results
print(t_test)

    Welch Two Sample t-test

data:  pct_of_max_velocity by injury
t = -0.4103, df = 18.833, p-value = 0.6862
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -4.964421  3.337870
sample estimates:
mean in group 0 mean in group 1 
       85.73959        86.55287 

Group 1 (injury) mean = 86.55 Group 2 (non-injury) mean = 85.74 While the mean % of max velocity is higher for the injury group, the difference is small and not statistically significant. (high p-value, 0.69)

Plots of Percent of Max Velocity in that week

# Plot of % Max Velocity during injury and non injury weeks
ggplot(injuries_and_running_clean, aes(x = week, y = pct_of_max_velocity)) +
  geom_point(aes(color = factor(injury))) +
  scale_color_manual(values = c("0" = "black", "1" = "red"),
                     labels = c("No Injury", "Injury"),
                     name = "Injury Status") +
  labs(
    title = "Percent of Max Velocity by Week",
    subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_mean, 2), " |  Mean % of Max Velocity, Non-Injury: ", round(non_injury_mean, 2)),
    x = "Week",
    y = "% of Max Velocity Reached"
  ) +
  theme_classic()



ggplot(injury_weeks, aes(x = week, y = pct_of_max_velocity)) +
  geom_point(color = "red") +
  labs(
    title = "Percent of Max Velocity During Injury Weeks",
    subtitle = paste("% of Max Velocity Mean: ", round(injury_mean, 2)),
    x = "Week",
    y = "% of Max Velocity"
  ) +
  theme_classic()


# Create bar chart
ggplot(injury_weeks, aes(x = factor(injury_event), y = pct_of_max_velocity)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(pct_of_max_velocity, 1)),  # round to 1 decimal place
            vjust = -0.5, size = 2.75) +
  labs(
    title = "Percent of Max Velocity During Injury Weeks",
    subtitle = paste("Mean: ", round(injury_mean, 2)),
    x = "Injury Event",
    y = "% of Max Velocity"
  ) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Plot of % Max Velocity during injury and non injury weeks
ggplot(week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
  geom_point(aes(color = factor(injury))) +
  scale_color_manual(values = c("0" = "black", "1" = "red"),
                     labels = c("No Injury", "Injury"),
                     name = "Injury Status") +
  labs(
    title = "Percent of Max Velocity of Prior Week",
    subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_week_prior_mean, 2), " |  Mean % of Max Velocity, Non-Injury: ", round(non_injury_week_prior_mean, 2)),
    x = "Week",
    y = "% of Max Velocity Reached"
  ) +
  theme_classic()


ggplot(injury_week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
  geom_point(color = "red") +
  labs(
    title = "Percent of Max Velocity the week before injury",
    subtitle = paste("% of Max Velocity Mean: ", round(injury_week_prior_mean, 2)),
    x = "Week",
    y = "% of Max Velocity"
  ) +
  theme_classic()


# Bar chart for week prior to injury % Max
ggplot(injury_week_prior, aes(x = factor(injury_event), y = lag1_pct_of_max_velocity)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(lag1_pct_of_max_velocity, 1)),  # round to 1 decimal place
            vjust = -0.5, size = 2.75) +
  labs(
    title = "Percent of Max Velocity the Week Before Injury",
    subtitle = paste("% Max Mean: ", round(injury_week_prior_mean, 2)),
    x = "Injury Event",
    y = "% of Max Velocity"
  ) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# T-test to compare sum of current week, 1, 2, and 3 weeks before
t_test_sum_last3weeks <- t.test(
  sum_last3weeks ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_last3weeks)

    Welch Two Sample t-test

data:  sum_last3weeks by injury
t = -0.83663, df = 16.234, p-value = 0.4149
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -15.360139   6.659672
sample estimates:
mean in group 0 mean in group 1 
       343.5134        347.8636 

Mean % of max week before: injury = 86.6, non-injury = 86.2. Not sig, pvalue = 0.87

Mean % change week before (current week % - last week %): injury = -0.057, non-injury = -0.50 Not sig, p-value = 0.89

Mean % change of last 2 weeks (current - 2 weeks ago): injury = -2.68, non-injury = -0.66 Not sig, p-value = 0.44

Mean % change of last 3 weeks (current - 3 weeks ago): injury = -0.60, non-injury = -0.81. Not sig, p-value = 0.95

Sum of current, 1, 2, 3 weeks before: injury = 347.86, non-injury = 343.51 Not sig, p-value = 0.41

# Look into the last two weeks

# Drop NAs
lag_2_injury <- injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)
lag_2_non_injury <- non_injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)

# Calculate means
mean_lag2_injury <- mean(lag_2_injury$lag2_pct_of_max_velocity)
mean_lag2_non_injury <- mean(lag_2_non_injury$lag2_pct_of_max_velocity)
mean_lag2_injury
mean_lag2_non_injury

# Two weeks before:
# T-test to compare lag2_pct_of_max_velocity (2 weeks before) between injury and non injury
t_test_2week_prior <- t.test(
  lag2_pct_of_max_velocity ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_2week_prior)

# Change of last two weeks (lag1 - lag2)
t_test_change_weeks1_2 <- t.test(
  change_weeks1_2 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_weeks1_2)

# Sum of weeks 1 and 2
t_test_sum_weeks1_2 <- t.test(
  sum_weeks1_2 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_weeks1_2)


# 3 weeks before
# T-test to compare lag3_pct_of_max_velocity (3 weeks before) between injury and non injury
t_test_3week_prior <- t.test(
  lag3_pct_of_max_velocity ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_3week_prior)

# Change between 1 weeks before and 3 weeks before
t_test_change_weeks1_3 <- t.test(
  change_weeks1_3 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_weeks1_3)

# Sum of 1, 2, and 3 weeks before injury
t_test_sum_weeks1_2_3 <- t.test(
  sum_weeks1_2_3 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_weeks1_2_3)

2 weeks before injury: Means of % of max: injury = 88.7, non-injury = 85.8 Not a significant difference in means, p-value, 0.1833 > 0.05.

Means of change: injury = -2.34, non-injury = 0.108 Not significant (p-value = 0.47)

Mean of sum of weeks 1 and 2: injury = 175.14, non-injury = 171.72 Not significant, p-value = 0.333

3 weeks before injury: Means of % of max: injury = 86.7, non-injury = 85.9 Not a significant difference in means, high p-value, 0.7

Mean sum of 1, 2, and 3 weeks before injury: injury = 261.8, non-injury = 257.3 Not sig, p-value = 0.36

Mean change between 1 and 3 weeks before: injury = -0.26, non-injury = -0.27 Not sig, p-val = 0.99

model <- glm(injury ~ change_lastweek + change_last2weeks,
             data = week_prior, family = "binomial")
summary(model)

Call:
glm(formula = injury ~ change_lastweek + change_last2weeks, family = "binomial", 
    data = week_prior)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -2.99411    0.27524 -10.878   <2e-16 ***
change_lastweek    0.01958    0.02695   0.726    0.468    
change_last2weeks -0.02992    0.03072  -0.974    0.330    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 119.42  on 302  degrees of freedom
Residual deviance: 118.44  on 300  degrees of freedom
  (30 observations deleted due to missingness)
AIC: 124.44

Number of Fisher Scoring iterations: 6

There is no clear evidence that % of max velocity in the current or prior weeks (or their changes/sums) differs between injury and non-injury weeks, based on t-tests of mean differences.

Comparing the number of >90% efforts with injury
# Convert 90 percent max to logical
# Instead of Yes/No it is True/False
catapult_weekly <- Catapult_Session_clean %>%
  mutate(
    Date = as.Date(Date),
    week = floor_date(Date, unit = "week"),
    Hit90 = Hit.90.Percent.Max == "Yes"  # Convert to logical
  ) %>%
  distinct(anon_id, Date, .keep_all = TRUE)


# Get all combinations of player and week
all_combinations <- catapult_weekly %>%
  distinct(anon_id, week)

# Count weekly >90% sprint hits
weekly_hits <- catapult_weekly %>%
  group_by(anon_id, week) %>%
  summarise(
    count_90pct = sum(Hit90, na.rm = TRUE),
    .groups = "drop"
  )

# Join with all combinations to include zero counts
weekly_90pct <- all_combinations %>%
  left_join(weekly_hits, by = c("anon_id", "week")) %>%
  mutate(count_90pct = replace_na(count_90pct, 0))

# See breakdown of counts of # of times >90
distrubution <- weekly_90pct %>%
  count(count_90pct) %>%
  arrange(desc(n))
# Create injury weeks
injury_weeks <- Incident_Report_clean %>%
  mutate(
    week = floor_date(as.Date(Date.of.Injury), unit = "week")
  ) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)

# Merge with injury data to have column that indicates injury week
weekly_90pct_injuries <- weekly_90pct %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = replace_na(injury, 0))

# Calculate lagged >90% effort counts
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  arrange(anon_id, week) %>%
  group_by(anon_id) %>%
  mutate(
    lag1_90pct_count = lag(count_90pct, 1),
    lag2_90pct_count = lag(count_90pct, 2),
    lag3_90pct_count = lag(count_90pct, 3),
    
    change_lastweek = count_90pct - lag1_90pct_count,
    change_last2weeks = count_90pct - lag2_90pct_count,
    change_last3weeks = count_90pct - lag3_90pct_count,
    
    sum_last2weeks = count_90pct + lag1_90pct_count,
    sum_last3weeks = count_90pct + lag1_90pct_count + lag2_90pct_count,
    
    avg_lastweek = (count_90pct + lag1_90pct_count) / 2,
    avg_last2weeks = (count_90pct + lag1_90pct_count + lag2_90pct_count) / 3,
    avg_last3weeks = (count_90pct + lag1_90pct_count + lag2_90pct_count + lag3_90pct_count) / 4
  ) %>%
  ungroup()
# Bar chart of sprint counts the week of an injury

# Only injuries
injury_summary <- weekly_90pct_injuries %>%
  group_by(count_90pct) %>%
  summarise(
    total_injuries = sum(injury),
    total_weeks = n()
  )

ggplot(injury_summary, aes(x = factor(count_90pct), y = total_injuries)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = total_injuries), vjust = -0.3, size = 4) +
  labs(
    x = "Weekly >90% Sprint Count",
    y = "Total Injuries",
    title = "Injuries by >90% Sprint Count in the Week of Injury"
  ) +
  theme_classic()

# Injury rate table for week of injury counts
injury_rate_table <- weekly_90pct_injuries %>%
  group_by(count_90pct) %>%
  summarise(
    total_weeks = n(),                           # Total weeks with this count
    injury_weeks = sum(injury == 1),             # How many had injury that week
    injury_rate = injury_weeks / total_weeks    # Proportion injured
  ) %>%
  arrange(desc(injury_rate))
injury_rate_table
# Create a contingency table manually
injury_table <- matrix(c(
  10, 1343,   # Low: injured, not injured
  6, 395,     # Moderate: injured, not injured
  0, 78      # High: injured, not injured
), nrow = 3, byrow = TRUE)

# Add row and column names for clarity
rownames(injury_table) <- c("Low", "Moderate", "High")
colnames(injury_table) <- c("Injured", "Not_Injured")

# Run Fisher's Exact Test
fisher_result <- fisher.test(injury_table)

# View the result
print(fisher_result)

Our p-value of 0.33 means that there is no statically significance association between our sprint load groups and injury. Injury events are pretty rare overall in our dataset, making it hard to detect subtle differences.

# Summarize injuries and total weeks by total sprint counts over last 2 weeks
injury_summary_2week <- weekly_90pct_injuries %>%
  mutate(sum_last2weeks = replace_na(sum_last2weeks, 0)) %>%
  group_by(sum_last2weeks) %>%
  summarise(
    total_injuries = sum(injury == 1),
    total_weeks = n(),
    injury_rate = (total_injuries / total_weeks) * 100
  ) %>%
  arrange(desc(injury_rate))
injury_summary_2week

# Plot total injuries by total sprint counts over last 2 weeks
ggplot(injury_summary_2week, aes(x = factor(sum_last2weeks), y = total_injuries)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = total_injuries), vjust = -0.3, size = 4) +
  labs(
    x = "Total >90% Sprint Counts Over Last 2 Weeks",
    y = "Total Injuries",
    title = "Injuries by Total >90% Sprint Counts Over the Last Two Weeks"
  ) +
  theme_classic()


# Plot injury rate
ggplot(injury_summary_2week, aes(x=factor(sum_last2weeks), y = injury_rate)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(injury_rate, 3)), vjust = -0.3, size = 4) +
  labs(
    x = "Total >90% Sprint Counts Over Last 2 Weeks",
    y = "Injury Rate",
    title = "Injurie Rate by Total >90% Sprint Counts Over the Last Two Weeks"
  ) +
  theme_classic()

Current week counts: Injuries increase slightly from 0-3 counts with 3 having the highest injury rate of 2.0%. 4 and 5 counts have no injuries but a low sample size.

Counts over last few weeks (Grouped): High counts had no injuries Moderate counts had the highest injury rate (1.52%) Low counts had a injury rate of 0.74%

Injury rates increased from low counts (<1) injury rate of 0.74% to moderate counts (1-2) injury rate of 1.52% then dropped to 0% for high counts (2+). However, high counts have a small sample size (78), so the rate may be unstable.

weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(workload_group = case_when(
    avg2_group < 0.5 ~ "Low (<0.5)",
    avg2_group >= 0.5 ~ "High (>0.5)",
    TRUE ~ NA_character_
  ))

injury_rate_grouped <- weekly_90pct_injuries %>%
  group_by(workload_group) %>%
  summarise(
    total_weeks = n(),
    injuries = sum(injury == 1, na.rm = TRUE),
    injury_rate = (injuries / total_weeks) * 100
  )
injury_rate_grouped
NA
NA

Average over the last three weeks. The injury rate is higher when athletes average over 0.5 over 90 percent max sprint counts.

# Make the table
contingency <- matrix(c(8, 716, 8, 1084), nrow = 2, byrow = TRUE)
colnames(contingency) <- c("Injury", "NoInjury")
rownames(contingency) <- c("High", "Low")

# Run the test
chisq.test(contingency)

    Pearson's Chi-squared test with Yates' continuity correction

data:  contingency
X-squared = 0.33061, df = 1, p-value = 0.5653
# Or for small expected counts:
fisher.test(contingency)

    Fisher's Exact Test for Count Data

data:  contingency
p-value = 0.4475
alternative hypothesis: true odds ratio is not equal to 1
95 percent confidence interval:
 0.4927592 4.6497160
sample estimates:
odds ratio 
   1.51357 

Since our p-value of 0.56>0.05, there is no statistically significant difference in injury rates between the High and Low >90 count groups. The chi-square test of the injury rates of average >90 counts of the last 3 weeks (either high > 0.5, or low < 0.5) showed no significant difference in injury rates between these groups.

It is important to note that the number of injury events observed in the dataset is relatively low. As a result, the statistical power to detect significant differences in injury rates between groups is limited. This low event count increases the likelihood of a Type II error (failing to detect a true effect), and thus, non-significant results should be interpreted with caution. Future studies with larger sample sizes or more injury occurrences are needed to better assess the relationship between workload and injury risk.

# T-tests:

# Current week
t.test(count_90pct ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Week before
t.test(lag1_90pct_count ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Change last week to this week
t.test(change_lastweek ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Change last two weeks
t.test(change_last2weeks ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Sum over last 2 weeks
t.test(sum_last2weeks ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(
    sprint_exposure_bin = ifelse(count_90pct > 0, "Some", "None")
  )

table_exposure <- table(weekly_90pct_injuries$sprint_exposure_bin, weekly_90pct_injuries$injury)

# Fisher's Exact Test
fisher.test(table_exposure)
# Create top quartile exposure
cutoff <- quantile(weekly_90pct_injuries$count_90pct, 0.75, na.rm = TRUE)

weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(
    sprint_exposure_group = ifelse(count_90pct >= cutoff, "High", "Low")
  )

table_quartile <- table(weekly_90pct_injuries$sprint_exposure_group, weekly_90pct_injuries$injury)

fisher.test(table_quartile)
glm(injury ~ count_90pct, data = weekly_90pct_injuries, family = "binomial")

glm(injury ~ lag1_90pct_count, data = weekly_90pct_injuries, family = "binomial")

change <- glm(injury ~ change_lastweek, data = weekly_90pct_injuries, family = "binomial")
summary(change)
glm(injury ~ change_last2weeks, data = weekly_90pct_injuries, family = "binomial")


sum <- glm(injury ~ sum_last3weeks, data = weekly_90pct_injuries, family = "binomial")
summary(sum)
multi_model <- glm(
  injury ~ count_90pct + sum_last2weeks,
  data = weekly_90pct_injuries,
  family = "binomial"
)

summary(multi_model)
ggplot(weekly_90pct_injuries, aes(x = change_lastweek, y = injury)) +
  geom_jitter(height = 0.05, alpha = 0.3, color = "blue") +  # Jitter to spread points vertically
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE, color = "red") +  # Logistic regression curve
  labs(
    title = "Probability of Injury vs Change in >90% Sprint Counts (Last Week)",
    x = "Change in >90% Sprint Counts (Current Week - Previous Week)",
    y = "Injury (0 = No, 1 = Yes)"
  ) +
  theme_minimal()

No t-tests or models revealed any insights.

#removing extra data
remove(band_long, bigs, clean_anons, combo, cor_data, cor_matrix, daily_90_counts,
       full_grid, hit_90_counts, ID_11, Incident_dates, injured_data, injuries_and_running,
       injury_by_group, injury_weeks, max_velocities, model_all_bands, model_big,
       model_bigs, model_combo, model_skill, player_counts_long, player_data,
       player_positions, plot_data, plots_by_position, position_averages,
       position_averages_with_team, pre_injury_weeks, QBs, relative_bands,
       relative_bands_long, skill, sprint_counts, sprint_exposure_big,
       sprint_exposure_combo, sprint_exposure_skill, sprint_injury_table, V8, 
       weekly_90_counts, weekly_band_effort_by_group, weekly_effort, weekly_effort_long,
       weekly_effort_wide, weekly_max_pct, weekly_max_velocity, weekly_sprint_counts,
       weekly_sprint_counts_lagged, weekly_sprint_exposure, weekly_velocity_efforts)

#removing extra values and functions
remove(all_athletes, all_weeks, band_levels, BIG, COMBO, SKILL, injured_ids,
       injury_week_nums, overall_avg, player_id, positions, Positions, qb_avg, team_avg,
       plot_hit_90_by_position)

Section 2: Running Imbalance

What is the variation at the team level and at each individual athlete level?

Looking at team data as a whole, since January 1, 2024 there is absolute no deviance from 0. That means that since January 1, 2024, the team has had the same average running imbalance of 0. This makes sense given that the team is so large and that imbalances can go from -100% to 100%. This suggests that throughout this time, at no point was there a team sway to one side. There also weren’t any points since January 1, 2024 that the team had any large spikes in average absolute value of running imbalance. This suggests that at no point throughout the season were there larger spikes than normal in running imbalance. Each player tends to have a very unique trend in their running imbalance. Looking at how the team varies but also at how each player varies throughout the season, it’s hard to make out any pattern that’s applicable to most people. The variance of running imbalance varies greatly between each player. Instead we looked at the variances between players who were injured and those who were not. Based on three different bootstrapped findings, we can see that the variances between players who were injured and those who were not were statistically significant.

For the first bootstrap, we compared the variance of the pooled groups meaning that the variance in running imbalance for players who were injured and those who were not were compared. This resulted in a 90% confidence interval which suggested that the difference in variance between the two groups is between 0.30 and 3.45. This suggests that when looking at the variance of the two groups separately but all the players are pooled together, the variances will most likely be different by factor between 0.30 and 3.45 and the variance for the injured pool will be greater than that of the uninjured pool. For the second bootstrap, each player’s variance was taken individually. This unpooled approach was taken to see if an individual player’s variance in running imbalance could potentially be related to HSI risk. The bootstrap algorithm in this case took the averaged variances of the bootstrapped sample for each group and compared them. This bootstrap produced a 90% confidence interval for the difference in average variance between the two groups is 0.79 to 1.32. These results suggest that players who sustained a hamstring injury since January 1, 2024 had, on average, a greater variance in their running imbalance by about 1.06. This suggests that there is a relationship between variance in running imbalance and HSI risk. This found increase in variability will be used to address the following questions. For the third bootstrap, we wanted to see if there was a difference in the average mean absolute value in running imbalance between players who were and weren’t injured. The bootstrapping algorithm for this test calculated the average absolute distance value or each running imbalance measurement and found the average for each sample. This test found that at the 90% significance level, injured players had an average running imbalance absolute value between 0.06 and 0.32 greater than their uninjured counterparts. These values though, when we consider that the range of running imbalance goes from 0 to 100 is small and may be hard to detect when out in the field.

We also looked at the relationship between running imbalance and higher level position. From this analysis we found that there doesn’t seem to be a super strong relationship between the three categories and average running imbalance variance. The bootstrap revealed that there are potentially significant differences between those who are Bigs and Combos. But, those who are Skills weren’t able to differentiate themselves between the two groups. Along with this, we looked at the average absolute value in running imbalance for the three groups. This analysis told us that while Combos and Bigs tended to have the same average absolute value running imbalance, Skills had a significantly higher average absolute running imbalance value. We can see from the very last chart in this analysis that Skills make up the most of those with hamstring injuries followed closely by Combos and Bigs making up around half of the amount of Skills. This is interesting considering that the amounts of Bigs, Skills, and Combos within the Historical Running data set are all roughly the same.

Team Analysis

#team variation
Historical_Running_clean %>%
  summarize(Team_Variation = var(Running.Imbalance))

#individual player variation
Historical_Running_clean %>%
  group_by(anon_id) %>%
  summarize(Player_Variation = var(na.omit(Running.Imbalance))) %>%
  ungroup() 

#average variance in running imbalance across all players
Historical_Running_clean %>%
  group_by(anon_id) %>%
  mutate(Player.var = var(na.omit(Running.Imbalance))) %>%
  ungroup() %>%
  summarize(Average_Player_Variance = mean(na.omit(Player.var)))

#making variance and average absolute value for each date to see trends
Historical_Running_clean <- Historical_Running_clean %>%
  group_by(Date) %>%
  mutate(Date.Variance = var(na.omit(Running.Imbalance)),
         Date.Avg.Abs.Value = mean(abs(na.omit(Running.Imbalance)))) %>%
  ungroup()
#calculating mean and variance for team data
team_mean <- mean(Historical_Running_clean$Running.Imbalance)
team_sd <- sd(Historical_Running_clean$Running.Imbalance)

#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = team_mean, color = "#CFB87C") +
  geom_hline(yintercept = team_mean + team_sd) +
  geom_hline(yintercept = team_mean - team_sd) +
  geom_hline(yintercept = team_mean + (2*team_sd), color = "#A2A4A3") +
  geom_hline(yintercept = team_mean - (2*team_sd), color = "#A2A4A3") +
  geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
  labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
       subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215") +
  theme_minimal()

#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = team_mean, color = "#CFB87C") +
  geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
  geom_line(aes(x=Date, y=Date.Avg.Abs.Value), color = "#CFB87C") +
  geom_line(aes(x=Date, y=-Date.Avg.Abs.Value), color = "#CFB87C") +
  labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
       subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215") +
  theme_minimal()

#making histogram of team running imbalance data
ggplot(Historical_Running_clean, aes(Running.Imbalance)) +
  geom_histogram() +
  labs(title = "Team Running Imbalance Since January 1, 2024", x="Running Imbalance") +
  theme_minimal()

Injured Analysis

#making histogram for running imbalance of all injured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(fill = "#CFB87C", alpha = 0.75) +
  #adding in 95% confidence interval
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.025), color = "#CFB87C") +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.975), color = "#CFB87C") +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players with HSI since January 1, 2024") +
  theme_minimal()

#making histogram for running imbalance of all uninjured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(fill = "black", alpha = 0.75) +
  #adding in 95% confidence interval
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.025)) +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.975)) +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players without HSI since January 1, 2024") +
  theme_minimal()

#Plotting injured and uninjured histograms over top one another
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(alpha = 0.75) +
  #adding in 95% CI for uninjured players
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.05)) +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.95)) +
  geom_histogram(data = Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance), fill = "#CFB87C", alpha = 0.75) +
  #adding in 95% CI for injured players
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.95), color = "#CFB87C") +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players with and without HSI since January 1, 2024") +
  theme_minimal()
#making scatter plot of running imbalance data for injured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Date, Running.Imbalance)) + 
  geom_point(alpha = 0.3) +
  ylim(-21,21) +
  labs(title = "Running Imbalance for Players with HSI since January 1, 2024") +
  theme_minimal()

#making scatter plot of running imbalance for uninjured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Date, Running.Imbalance)) + 
  geom_point(alpha = 0.3) +
  ylim(-21,21) +
  labs(title = "Running Imbalance for Players without HSI since January 1, 2024") +
  theme_minimal()

Bootstrapping Differences between Injured and Uninjured Athletes

#Splitting up the data sets and calculating player variance and measurement absolute value
injured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,] %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()

uninjured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,] %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
#making a data frame to hold all of the within group variances
group_variances <- data.frame(injured_var = rep(NA,5000),
                uninjured_var = rep(NA,5000),
                diff_in_var = rep(NA, 5000))

#bootstrap for variance, 5000 iterations
for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
  
  #storing the calculated variances in data frame
  group_variances[i,1] = var(injured_sample$Running.Imbalance)
  group_variances[i,2] = var(uninjured_sample$Running.Imbalance)
  group_variances[i,3] = group_variances[i,1] - group_variances[i,2]
}
ggplot(data=group_variances, aes(diff_in_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.95), color= "#CFB87C") +
  labs(x="Difference in Variance") +
  theme_minimal()

ggplot(data=group_variances) +
  geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_var), alpha = 0.75) +
  labs(x="Variance") +
  theme_minimal()
#making a data frame to hold all of the average player variances between groups
mean_player_variances <- data.frame(injured_var = rep(NA,5000),
                uninjured_var = rep(NA,5000),
                diff_in_var = rep(NA, 5000))

for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
  
  #storing the calculated variances in data frame
  mean_player_variances[i,1] = mean(na.omit(injured_sample$Player.Variance))
  mean_player_variances[i,2] = mean(na.omit(uninjured_sample$Player.Variance))
  mean_player_variances[i,3] = mean_player_variances[i,1] - mean_player_variances[i,2]
}
ggplot(data = mean_player_variances, aes(diff_in_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.95), color= "#CFB87C") +
  labs(title = "Difference in Estimated Average Variance for Injured and Uninjured Athletes", x="Difference in Average Variance") +
  theme_minimal()

ggplot(data=mean_player_variances) +
  geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_var), alpha = 0.75) +
  labs(title = "Estimated Average Variance for Injured and Uninjured Athletes", x="Average Variance") +
  theme_minimal()
#making a data frame to hold all of the average absolute differences from 0
group_distance <- data.frame(injured_dist = rep(NA,5000),
                uninjured_dist = rep(NA,5000),
                diff_in_dist = rep(NA, 5000))

#bootstrap for variances, 5000 iterations
for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1658)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2405)
  
  #storing the calculated variances in data frame
  group_distance[i,1] = mean(injured_sample$Player.Absolute.Dist)
  group_distance[i,2] = mean(uninjured_sample$Player.Absolute.Dist)
  group_distance[i,3] = group_distance[i,1] - group_distance[i,2]
}
ggplot(data = group_distance, aes(diff_in_dist)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.95), color= "#CFB87C") +
  labs(title = "Estimated Difference in Average Absolute Value", x="Difference in Average Absolute Value") +
  theme_minimal()

ggplot(data = group_distance) +
  geom_histogram(aes(injured_dist), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_dist), alpha = 0.75) +
  labs(title = "Estimated Average Absolute Value", x="Average Absolute Value")  +
  theme_minimal()
#removing junk that came from the loops
remove(i,team_mean, team_sd, injured_sample, uninjured_sample, group_variances, injured_data, uninjured_data, mean_player_variances, group_distance)

Position Analysis

#making lists to sort position into larger categories
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Positions <- c("COMBO", "BIG", "SKILL")

#giving positions to incident report
Incident_Report_clean <- Incident_Report_clean %>%
  mutate(Specific.Position = Position,
         Position = case_when(Specific.Position %in% COMBO ~ "COMBO",
                               Specific.Position %in% BIG ~ "BIG",
                               Specific.Position %in% SKILL ~ "SKILL"))

#giving positions to catapult session
Catapult_Session_clean <- Catapult_Session_clean %>%
  mutate(Specific.Position = Primary.Position,
         Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
                               Primary.Position %in% BIG ~ "BIG",
                               Primary.Position %in% SKILL ~ "SKILL"))

#only taking IDs and position names and categories
incident_info <- Incident_Report_clean[,c("anon_id", "Position", "Specific.Position")]
catapult_info <- Catapult_Session_clean[,c("anon_id", "Position", "Specific.Position")]

#comprehensive list of IDs, their position, and category
info <- distinct(rbind(incident_info, catapult_info))

#add this only historical running
Historical_Running_clean <- left_join(Historical_Running_clean, info, by="anon_id",
                                      relationship="many-to-many") %>%
  #calculating each player's variance in running imbalance
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
for(i in 1:3){
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$Position==Positions[i],],aes(Running.Imbalance)) +
    geom_histogram() +
    labs(subtitle=Positions[i])
  
  print(p)
}
Player_Summary_Stats <- distinct(Historical_Running_clean[,c("anon_id", "Position",
                                                 "Specific.Position", 
                                                 "Player.Variance")])
for(i in 1:3){
  p <- ggplot(data=Player_Summary_Stats[Player_Summary_Stats$Position==Positions[i],],
              aes(Player.Variance)) +
    geom_histogram() +
    labs(subtitle=Positions[i])
  
  print(p)
}

Bootstrapping Different Positions

#splitting up data set into the different categories
COMBOS <- Player_Summary_Stats %>%
  filter(Position == "COMBO") %>%
  na.omit()

SKILLS <- Player_Summary_Stats %>%
  filter(Position == "SKILL") %>%
  na.omit()

BIGS <- Player_Summary_Stats %>%
  filter(Position == "BIG") %>%
  na.omit()

group_avg_variance <- data.frame(COMBO_var = rep(NA, 5000),
                                 SKILL_var = rep(NA, 5000),
                                 BIG_var = rep(NA, 5000))
for(i in 1:5000){
  set.seed(i)
  combo_sample <- sample_n(COMBOS, size=20, replace=TRUE)
  skill_sample <- sample_n(SKILLS, size=22, replace=TRUE)
  big_sample <- sample_n(BIGS, size=24, replace=TRUE)
  
  group_avg_variance[i,1] <- mean(na.omit(combo_sample$Player.Variance))
  group_avg_variance[i,2] <- mean(na.omit(skill_sample$Player.Variance))
  group_avg_variance[i,3] <- mean(na.omit(big_sample$Player.Variance))
}
ggplot(data=group_avg_variance, aes(COMBO_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.05)) +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.95)) +
  theme_minimal()

ggplot(data=group_avg_variance, aes(SKILL_var)) +
  geom_histogram(fill="#CFB87C") +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.05), color="#CFB87C") +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.95), color="#CFB87C") +
  theme_minimal()

ggplot(data=group_avg_variance, aes(BIG_var)) +
  geom_histogram(fill="#A2A4A3") +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.05), color="#A2A4A3") +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.95), color="#A2A4A3") +
  theme_minimal()

ggplot(data=group_avg_variance) +
  geom_histogram(aes(COMBO_var), alpha=0.5, fill="black") +
  geom_histogram(aes(SKILL_var), alpha=0.5, fill="#CFB87C") +
  geom_histogram(aes(BIG_var), alpha=0.5, fill="#A2A4A3") +
  labs(title="Estimated Average Variance in Running Imbalance", x="Average Variance in Running Imbalance") +
  theme_minimal()
var_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                       Lower_Bound = c(quantile(group_avg_variance$COMBO_var, 0.05), 
                                       quantile(group_avg_variance$SKILL_var, 0.05), 
                                       quantile(group_avg_variance$BIG_var, 0.05)),
                       Median = c(quantile(group_avg_variance$COMBO_var, 0.5),
                                  quantile(group_avg_variance$SKILL_var, 0.5),
                                  quantile(group_avg_variance$BIG_var, 0.5)),
                       Upper_Bound = c(quantile(group_avg_variance$COMBO_var, 0.95),
                                       quantile(group_avg_variance$SKILL_var, 0.95),
                                       quantile(group_avg_variance$BIG_var, 0.95)))
head(var_confints)

ggplot(data=var_confints, aes(x=Category, y=Median, ymin=Lower_Bound, ymax=Upper_Bound, color=Category)) +
  geom_pointrange() +
  labs(y="Estimated Average Variance in Running Imbalance", title="Running Imbalance Variance 90% CI") +
  scale_color_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  theme_minimal()
Historical_Running_clean <- Historical_Running_clean %>%
  mutate(Abs.Value.Running.Imbalance = abs(Running.Imbalance))

COMBOS <- Historical_Running_clean %>%
  filter(Position == "COMBO")

SKILLS <- Historical_Running_clean %>%
  filter(Position == "SKILL")

BIGS <- Historical_Running_clean %>%
  filter(Position == "BIG")

group_avg_dist <- data.frame(COMBO_dist = rep(NA, 5000),
                             SKILL_dist = rep(NA, 5000),
                             BIG_dist = rep(NA, 5000))
for(i in 1:5000){
  set.seed(i)
  combo_sample <- sample_n(COMBOS, size=1203, replace=TRUE)
  skill_sample <- sample_n(SKILLS, size=1635, replace=TRUE)
  big_sample <- sample_n(BIGS, size=1224, replace=TRUE)
  
  group_avg_dist[i,1] <- mean(na.omit(combo_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i,2] <- mean(na.omit(skill_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i,3] <- mean(na.omit(big_sample$Abs.Value.Running.Imbalance))
}
ggplot(data=group_avg_dist) +
  geom_histogram(aes(COMBO_dist), alpha=0.5, fill="black") +
  geom_histogram(aes(SKILL_dist), alpha=0.5, fill="#CFB87C") +
  geom_histogram(aes(BIG_dist), alpha=0.5, fill="#A2A4A3") +
  labs(title = "Estimated Average Absolute Value Running Imbalance", x="Average Absolute Value Running Imbalance") +
  theme_minimal()
dist_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                       Lower_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.05), 
                                       quantile(group_avg_dist$SKILL_dist, 0.05), 
                                       quantile(group_avg_dist$BIG_dist, 0.05)),
                       Median = c(quantile(group_avg_dist$COMBO_dist, 0.5),
                                  quantile(group_avg_dist$SKILL_dist, 0.5),
                                  quantile(group_avg_dist$BIG_dist, 0.5)),
                       Upper_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.95),
                                       quantile(group_avg_dist$SKILL_dist, 0.95),
                                       quantile(group_avg_dist$BIG_dist, 0.95)))
head(dist_confints)

ggplot(data=dist_confints, aes(x=Category, y=Median, ymin=Lower_Bound, ymax=Upper_Bound, color=Category)) +
  geom_pointrange() +
  scale_color_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Average Absolute Value Running Imbalance 90% CI", y="Estimated Average Absolute Value Running Imbalance") +
  theme_minimal()
Injury_Incidents <- distinct(Incident_Report_clean[,c("anon_id", "Position", "Date.of.Injury")])

Position_counts <- na.omit(distinct(Historical_Running_clean[,c("anon_id", "Position")]))

ggplot(Injury_Incidents, aes(Position, fill=Position)) +
  geom_bar() +
  scale_fill_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Hamstring Injuries by Position") +
  theme_minimal()

ggplot(Position_counts, aes(Position, fill=Position)) +
  geom_bar() +
  scale_fill_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Total Position Counts on Team") +
  theme_minimal()
remove(big_sample, BIGS, catapult_info, combo_sample, COMBOS, confints, group_avg_variance,
       incident_info, info, p, Player_Summary_Stats, skill_sample, SKILLS, BIG, COMBO, i,
       Positions, SKILL, dist_confints, group_avg_variance, group_avg_dist, group_distance, 
       group_variances, injured_data, injured_sample, Injury_Incidents, mean_player_variances, 
       Position_counts, var_confints)

What is a meaningful change? What red flags should go off when we see a week-to-week change in running imbalance?

Based on the analysis below, there doesn’t seem to be any major discernible differences in running imbalance before or following an injury. This suggests that there may not be a direct link between HSI risk and running imbalance value directly beforehand. Instead, the trends of many players who were injured seems to have a relatively consistent trend not entirely dependent on time. Looking at summary statistics of running imbalance in the weeks leading up to and following a hamstring injury, there are also no glaring trends. For this analysis, we looked at the mean and variance in running imbalance per week leading up to and after an injury for all of the injured players with running imbalance data. This showed us that there is no clear indicator of HSI risk in running imbalance or any summary statistic of it. Instead it may be more useful to look at each player’s total running imbalance and their individual variance. This seems to be more of a useful tool for differentiating between injured and uninjured athletes.

#getting running imbalances for just injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)

#making new column to represent when in time injury would be, negative means before injury and positive means after injury, 0 means date of injury if there's data for that day
Injured_Historical_Running$Weeks.After.Injury <- rep(NA, 1658)
Injured_Historical_Running$Injury.Count <- rep(NA, 1658)

#making new column in incident report for the injury count
Incident_Report_clean$Injury.Count <- rep(NA, 122)

#go through all of the injured players in the data set
for(i in 1:22){
  #get the dates each player was injured
  injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
  
  #go through all of the dates in which the player had an injury
  for(j in 1:length(injury_dates)){
    
    #calculate dates for 1, 2, 3, 4 weeks before and after each injury date
    past_1 <- injury_dates[j]-7
    past_2 <- injury_dates[j]-14
    past_3 <- injury_dates[j]-21
    past_4 <- injury_dates[j]-28
    future_1 <- injury_dates[j]+7
    future_2 <- injury_dates[j]+14
    future_3 <- injury_dates[j]+21
    future_4 <- injury_dates[j]+28
    
    #Calculating how many injuries this is for the player
    injury_count <- as.character((length(injury_dates)) - j + 1)
    
    #compare date of data point for each player to date of injury, store in Weeks.After.Injury column, store injury count
    
    #first week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > injury_dates[j] & 
                                 Injured_Historical_Running$Date<=future_1,]$Weeks.After.Injury <- "1"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > injury_dates[j] & 
                                 Injured_Historical_Running$Date<=future_1,]$Injury.Count <- injury_count
    
    #second week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_1 & 
                                 Injured_Historical_Running$Date<=future_2,]$Weeks.After.Injury <- "2"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_1 & 
                                 Injured_Historical_Running$Date<=future_2,]$Injury.Count <- injury_count
    
    #third week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_2 & 
                                 Injured_Historical_Running$Date<=future_3,]$Weeks.After.Injury <- "3"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_2 & 
                                 Injured_Historical_Running$Date<=future_3,]$Injury.Count <- injury_count
    
    #fourth week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_3 & 
                                 Injured_Historical_Running$Date<=future_4,]$Weeks.After.Injury <- "4"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_3 & 
                                 Injured_Historical_Running$Date<=future_4,]$Injury.Count <- injury_count    
    #week right before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < injury_dates[j] & 
                                 Injured_Historical_Running$Date>=past_1,]$Weeks.After.Injury <- "-1"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < injury_dates[j] & 
                                 Injured_Historical_Running$Date>=past_1,]$Injury.Count <- injury_count
    #two weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_1 & 
                                 Injured_Historical_Running$Date>=past_2,]$Weeks.After.Injury <- "-2"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_1 & 
                                 Injured_Historical_Running$Date>=past_2,]$Injury.Count <- injury_count
    
    #three weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_2 & 
                                 Injured_Historical_Running$Date>=past_3,]$Weeks.After.Injury <- "-3"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_2 & 
                                 Injured_Historical_Running$Date>=past_3,]$Injury.Count <- injury_count
        
    #four weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_3 & 
                                 Injured_Historical_Running$Date>=past_4,]$Weeks.After.Injury <- "-4"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_3 & 
                                 Injured_Historical_Running$Date>=past_4,]$Injury.Count <- injury_count
    
    #Date of Injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date == injury_dates[j],]$Weeks.After.Injury <- "0"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date == injury_dates[j],]$Injury.Count <- injury_count
    
    
    #adding injury count to indicent report
    Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Injury.Count <- injury_count
    
  }
}


#making weeks after injury and injury count into a factor and combining data sets back together
Injured_Historical_Running <- Injured_Historical_Running %>%
  mutate(Weeks.After.Injury = factor(Weeks.After.Injury),
         Injury.Count = factor(Injury.Count))

Historical_Running_clean <- left_join(Historical_Running_clean, Injured_Historical_Running)

#getting rid of junk that was from the loop
remove(future_1, future_2, future_3, future_4, i, injury_count, injury_dates, j, past_1, past_2, past_3, past_4)
remove(Injured_Historical_Running)

Injury Risk

for(i in 1:22){
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id==injured_IDs[i],], aes(Date, Running.Imbalance)) +
    geom_line(linetype=1) +
    geom_point(aes(color=Weeks.After.Injury)) +
    geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury, linetype=2) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue")) +
    theme_minimal() +
    labs(title="Running Imbalance", subtitle = injured_IDs[i])
  
  print(p)
}
#making summary statistics of running imbalance per week relative to injury
Historical_Running_clean <- Historical_Running_clean %>%
  group_by(anon_id, Injury.Count, Weeks.After.Injury) %>%
  mutate(Weeks.After.Injury.Variability = var(Running.Imbalance),
         Weeks.After.Injury.Mean = mean(abs(Running.Imbalance))) %>%
  ungroup()
for(i in 1:22){
  #looking at mean running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Mean, group=Injury.Count)) +
  geom_line() +
  geom_point(aes(color=Weeks.After.Injury)) +
    xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
  geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
    labs(title="Average Absolute Distance per Week",injured_IDs[i]) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
  
  print(p)
}
for(i in 1:22){
  #looking at variance in running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Variability, group=Injury.Count)) +
  geom_line() +
  geom_point(aes(color=Weeks.After.Injury)) +
    xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
  geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
    labs(title="Variance in Running Imbalance per Week", subtitle=injured_IDs[i]) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
  
  print(p)
}

Detectable Trend with Injury Risk

trends <- data.frame(ID = all_IDs,
                     KC = rep(0, 71))
for(i in 1:71){
  #only plotting if there are over 15 data points for each player
  if(nrow(Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],])>15){
    #calculating how strong of a non linear trend there is using a gam
    df <- summary(gam(Running.Imbalance~s(Days.Since.Start),
              data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]))$edf
    #calculating how strong of a linear trend there is using kendall correlation
    Kendall_Cor <- cor(x=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Days.Since.Start, y=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Running.Imbalance, method="kendall")
    
    trends[trends$ID == all_IDs[i],2] = Kendall_Cor
  
    if(all_IDs[i] %in% injured_IDs){
      p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],],
             aes(Date, Running.Imbalance)) +
        geom_point(color="skyblue") +
        geom_line(color="skyblue") +
        labs(subtitle=Kendall_Cor)
    }
    if(!(all_IDs[i] %in% injured_IDs)){
      p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],],
             aes(Date, Running.Imbalance)) +
        geom_point() +
        geom_line() +
        labs(subtitle=Kendall_Cor)
    }
    
    print(p)
  }
}

#plotted only from January 1, 2025 to May 1, 2025
trends <- trends %>%
  mutate(injured = ifelse(ID %in% injured_IDs,1,0))

ggplot(trends[trends$KC>0 & trends$injured==1,], aes(KC)) +
  geom_histogram()

ggplot(trends[trends$KC>0 & trends$injured==0,], aes(KC)) +
  geom_histogram()
injured_trends <- trends %>%
  filter(injured==1)

uninjured_trends <- trends %>%
  filter(injured==0)


kendall_cors <- data.frame(injured_avg_KC = rep(NA,5000),
                           uninjured_avg_KC = rep(NA,5000),
                           avg_diff_KC = rep(NA,5000))

for(i in 1:5000){
  set.seed(i)
  injured_samp <- sample_n(injured_trends, size=22, replace=TRUE)
  uninjured_samp <- sample_n(uninjured_trends, size=49, replace=TRUE)
  
  kendall_cors[i,1] <- mean(abs(injured_samp$KC))
  kendall_cors[i,2] <- mean(abs(uninjured_samp$KC))
  kendall_cors[i,3] <- kendall_cors[i,1] - kendall_cors[i,2]
}
ggplot(data=kendall_cors, aes(avg_diff_KC)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(kendall_cors$avg_diff_KC, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(kendall_cors$avg_diff_KC, 0.95), color = "#CFB87C") +
  labs(title = "Estimated Difference in Average Kendall Rank Correlation Coefficient",
       x = "Difference in Average Kendall Coefficient")

ggplot(data=kendall_cors) +
  geom_histogram(aes(injured_avg_KC), alpha=0.7, fill="#CFB87C") +
  geom_histogram(aes(uninjured_avg_KC), alpha=0.7) +
  labs(title = "Estimated Average Kendall Rank Correlation Coefficient",
       x = "Average Kendall Coefficient")
remove(i, p, df, Kendall_Cor, kendall_cors, trends)

Is running imbalance sensitive enough of a metric to use as a prognosis tool versus a rehab tool?

Based on the analysis below, we can see that by solely using variance in running imbalance from the time of the injury to the end of the predicted return-to-play range is not a very strong predictor for whether or not it will take longer for a player to recover or not. In this analysis, we used the running imbalance in the time frame starting with the injury date to the end of the prognosis time frame. With these specific running imbalance values, we calculated the variance in running imbalance and whether or not the athlete returned to play within the predicted time frame or not. This analysis found that when using the variance in these time frames as the only predictor in a logistic regression model, the slope coefficient associated with the variance was statistically significant at the \(\alpha = 0.01\) significance level. With that though, the cross validated accuracy of the model was only around 0.6 suggesting that it wasn’t super strong in practice. In order to understand the impact that variance in running imbalance had on whether or not a player returned in the predicted time frame or not, we performed a bootstrap. We separated the observations in which players did and did not return in the predicted time frame into two different data sets. We then sampled from each of these two data sets and calculated the average variance for each sample. This was repeated 5000 times. This gave us an estimate of the average variance in running imbalance for players who returned within the predicted time frame and those who did not. This bootstrap revealed that players who did not return within the predicted time frame had a variance greater by roughly 1.2 during their time of recovery than those who returned on time. This tells us that while variance in running imbalance is not directly strong enough to predict whether or not an athlete will return within the predicted time frame, it can be used to supplement prognosis or make adjustments to the prognosis during the time of recovery of a HSI.

#Calculating date back to play in incident report data set
Incident_Report_clean <- Incident_Report_clean %>%
  filter(!is.na(Injury.Prognosis))%>%
  #calculating how long predicted time loss is based on prognosis
         #beginning of predicted range of return
  mutate(Expected.Start.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
                                        Date.of.Injury,
                                        ifelse(Injury.Prognosis=="Less than 1 Week",
                                               Date.of.Injury,
                                               ifelse(Injury.Prognosis=="1-4 Weeks",
                                                      Date.of.Injury+days(7),
                                                      Date.of.Injury+days(28))))),
         #end of predicted range of return
         Expected.End.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
                                        Date.of.Injury,
                                        ifelse(Injury.Prognosis=="Less than 1 Week",
                                               Date.of.Injury+days(7),
                                               ifelse(Injury.Prognosis=="1-4 Weeks",
                                                      Date.of.Injury+days(28),
                                                      Date.of.Injury+days(56)))))) %>%
  group_by(anon_id, Date.of.Injury) %>%
  #calculating actual date cleared to return
  mutate(Actual.Return = Date.of.Injury+days(sum(na.omit(Days.in.Status)))) %>%
  ungroup()
for(i in 1:22){
  #looking at mean running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Running.Imbalance)) +
  geom_line() +
    #Marking when injury occurred with gold line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury, color="#CFB87C") +
    #Marking actual return date with solid brown line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Actual.Return, color="#8D7334", linetype=1) +
  #Marking beginning of predicted return to play range with grey dotted line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.Start.Return, color="#A2A4A3", linetype=3) +
    #Marking end of predicted return to play range with grey dotted line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.End.Return, color="#A2A4A3", linetype=3) +
    labs(title=injured_IDs[i]) +
    annotate("rect", 
             xmin = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.Start.Return,
             xmax = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.End.Return,
             ymin=min(Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],]$Running.Imbalance), ymax=max(Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],]$Running.Imbalance), alpha=0.3) +
    theme_minimal()
  
  print(p)
}
#looking at running imbalance of only injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)

#Making binary column if date was in time range of injury prognosis
Injured_Historical_Running$Date.in.Range <- rep(0, 1658)

#go through all of the injured players in the data set
for(i in 1:22){
  #get the dates each player was injured
  injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
  
  #go through dates of injury for each player
  for(j in 1:length(injury_dates)){
    if(injured_IDs[i] == "ID_50"){ #ID_50 does not have enough running imbalance data
      break
    }
    #get the expected date of return for that instance of injury
    expected_return <- as.Date(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Expected.End.Return[1])
    
    #if the date in running imbalance is between day of injury and last day of prediction, set as 1
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] & Injured_Historical_Running$Date >= injury_dates[j] & Injured_Historical_Running$Date <= expected_return,]$Date.in.Range <- 1
  }
}

#making a column for the variance in running imbalance for each injury instance range
Injured_Historical_Running <- Injured_Historical_Running %>%
  filter(Date.in.Range == 1) %>%
  group_by(anon_id, Injury.Count) %>%
  mutate(Injury.Variability = var(Running.Imbalance)) %>%
  ungroup()

#adding injury and running data sets together
Injured_Data <- left_join(Injured_Historical_Running, Incident_Report_clean, by=c("anon_id", "Injury.Count"), relationship = "many-to-many") %>%
  #making new column if player returned in predicted time frame
  mutate(Return.in.Range = ifelse(Actual.Return>=Expected.Start.Return & Actual.Return <= Expected.End.Return, 1, 0)) %>%
  #removing rows that are missing important data
  filter(!is.na(Injury.Variability),
         !is.na(Return.in.Range))

Logistic Regression Model

set.seed(1000)
#making a 75% to 25% training to testing split
rows <- sample(1:nrow(Injured_Data), size=(nrow(Injured_Data)*0.75), replace=FALSE)
Injured_train <- Injured_Data[rows,]
Injured_test <- Injured_Data[-rows,]

#building logistic regression model from training data
return_to_play_model <- glm(Return.in.Range~Injury.Variability, data=Injured_train, family="binomial")

#looking at coefficients and p-values
summary(return_to_play_model)

#making predictions on testing data based off of the model built
Injured_test <- Injured_test %>%
  mutate(Prediction = ifelse(predict(return_to_play_model, newdata=Injured_test, type="response")>0.5, 1, 0))

#calculating CER
Injured_test %>%
  summarize(CER = mean(Prediction != Return.in.Range))
#making model with all of the data
return_to_play_cv <- glm(Return.in.Range~Injury.Variability, data=Injured_Data, family="binomial")

#making cost function for CER
cost <- function(obs, pred){
  mean((pred <= 0.5) & obs==1 | (pred > 0.5) & obs==0)
}

set.seed(1000)

#cross validating with K=10
ten_cv <- cv.glm(data=Injured_Data,glmfit=return_to_play_cv,cost,K=10)

#extract average error
ten_cv$delta[1]

Bootstrapping Differences Between In and Out of Range Return to Plays

#taking only players who recovered in predicted time frame
In_Range <- Injured_Data %>%
  filter(Return.in.Range == 1,
         !is.na(Injury.Variability))

#taking only players who did not recover in the predicted time frame
Out_Range <- Injured_Data %>%
  filter(Return.in.Range == 0,
         !is.na(Injury.Variability))
#making data frame to hold values
Range_Variances <- data.frame(in.avg.var = rep(NA, 5000),
                              out.avg.var = rep(NA, 5000),
                              diff.avg.var = rep(NA, 5000))

#bootstrapping 5000 times
for(i in 1:5000){
  set.seed(i)
  
  #sample from players who recovered in predicted range
  in_sample <- sample_n(In_Range, size=308, replace=TRUE)
  #sample fro players who did not recover in predicted range
  out_sample <- sample_n(Out_Range, size=477, replace=TRUE)
  
  
  #calculating variances of each sample and storing in data frame
  Range_Variances[i,1] <- mean(in_sample$Injury.Variability)
  Range_Variances[i,2] <- mean(out_sample$Injury.Variability)
  Range_Variances[i,3] <- Range_Variances[i,2] - Range_Variances[i,1] #diff in variances
}
#plotting differences in average variance
ggplot(data=Range_Variances, aes(diff.avg.var)) +
  geom_histogram() +
  #adding in 90% CI (does not include 0)
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.05), color ="#CFB87C") +
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.95), color ="#CFB87C") +
  labs(title = "Estimated Difference in Average Variance of Running Imbalance",
       x= "Difference in Average Variance Running Imbalance") +
  theme_minimal()

#plotting average variances of difference groups
ggplot(data=Range_Variances) +
  #players who recovered in predicted time frame
  geom_histogram(aes(in.avg.var), alpha = 0.75) +
  #players who did not recover in the predicted time frame
  geom_histogram(aes(out.avg.var), alpha = 0.75, fill ="#CFB87C") +
  labs(title = "Estimated Average Variance in Running Imbalance", 
       x="Average Variance in Running Imbalance") +
  theme_minimal()
remove(p,i,j,rows,ten_cv, cost, Injured_Data, Injured_Historical_Running, Injured_test, Injured_train, Range_Variances, return_to_play_cv, return_to_play_model, expected_return, injury_dates, In_Range, Out_Range, in_sample, out_sample)
---
title: "Data Analysis Notebook"
output: html_notebook
---

```{r}
#Authors: Ian McElveen and Cecilia Gonzales
#Author Date: 7/14/2025
#Purpose: The purpose of this notebook is to house all data set transformation, cleansing, visualization, statistical analysis, and note-taking for the 2025 CU Athletic Department Sports Science Internship Program

#LAST UPDATED: 8/5/2025

#Including helpful libraries
library(tidyverse)
library(readxl)
library(aod)
library(gt)
library(boot)
library(mgcv)
library(lme4)
library(leaps)
```


# Data Cleaning
```{r Loading in the data sets}
#loading in the Catapult data to look at sprinting values
Catapult_Session <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Catapult Session - Outdoor FB.csv")

#loading in the Historical Running data to look at running imbalance values
Historical_Running <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Compiled Historical Running Imbalance FB.csv")

#loading in the Incident Report to look at HSIs
Incident_Report <- read_csv("data-sets/data-sets-uncompressed/data-sets-compressed/Running Imbalance and Speed/Incident Report FB IDs.csv")

```

```{r Cleaning Catapult_Session}
Catapult_Session_clean <- Catapult_Session %>%
  #putting the date as a date class
  mutate(Date = as.Date(Date, "%m/%d/%Y")) %>%
  #only selecting important columns for this analysis
  select(anon_id, Date, Age, Primary.Position, Total.Distance, Period.Name, Total.Duration..min., Velocity.Band.1.Total.Distance, Velocity.Band.2.Total.Distance, Velocity.Band.3.Total.Distance, Velocity.Band.4.Total.Distance, Velocity.Band.5.Total.Distance, Velocity.Band.6.Total.Distance, Velocity.Band.7.Total.Distance, Velocity.Band.8.Total.Distance, Velocity.Band.2.Total.Effort.Count, Velocity.Band.3.Total.Effort.Count, Velocity.Band.4.Total.Effort.Count, Velocity.Band.5.Total.Effort.Count, Velocity.Band.6.Total.Effort.Count, Velocity.Band.7.Total.Effort.Count, Velocity.Band.8.Total.Effort.Count, Maximum.Velocity, Average.Velocity, Hit.90.Percent.Max, Date.of.Last.90.Effort, Days.Since.Last.90.Effort, Hit.Max.Velocity., Date.of.All.Time.Max.Velocity, Days.Since.Max.Velocity, Session.Max.Velocity) %>%
  #calculating each player's maximum velocity
  group_by(anon_id) %>%
  mutate(Player.Max.Velocity = max(na.omit(Maximum.Velocity))) %>%
  ungroup() %>%
  #only selecting data from January 1, 2024 and on
  filter(Date >= "2024-01-01")

head(Catapult_Session_clean)
```

```{r Cleaning Historical_Running}
Historical_Running_clean <- Historical_Running %>%
  #taking out rows that don't have data
  filter(Running.Imbalance != "n/a") %>%
  #putting running imbalance as a number and converting the date to a date class
  mutate(Running.Imbalance = as.numeric(Running.Imbalance),
         Date = as.Date(Date, "%m/%d/%Y")) %>%
  #only using data from January 1, 2024 and on
  filter(Date >= "2024-01-01") %>%
  mutate(X=1:4063) %>%
  #making days since January 1, 2024 for each player
  group_by(anon_id) %>%
  mutate(Days.Since.Start = as.numeric(Date - min(Date))) %>%
  ungroup()

head(Historical_Running_clean)
```

```{r Cleaning Incident_Report}
Incident_Report_clean <- Incident_Report %>%
  #filtering for only hamstring injuries
  filter(OSICS14.Code == "TM1",
         Status != "Full Go")  %>%
  #getting the date of the injury as a date class
  mutate(Date = as.Date(Date, "%m/%d/%Y"),
         Date.of.Injury = as.Date(Date.of.Injury...Onset.of.symptoms, "%m/%d/%Y"),
         Examination.Date = as.Date(Examination.Date, "%m/%d/%Y")) %>%
  #only selecting relevant columns for this analysis
  select(anon_id, Position, Date, Date.of.Injury, Time.of.Injury, Side, OSICS.Injury.Diagnosis, Coach.s.Diagnosis, Recurrence.of.Injury, Choose.Season, Onset.of.Symptoms, Injury.Prognosis, General.Mechanism, Specific.Mechanism, Injured.While., Type.of.Event, Season., Status, Days.in.Status) %>%
  #making days out due to injury for each player and each injury they sustained
  group_by(anon_id, Date.of.Injury) %>%
  mutate(Days.Out = sum(Days.in.Status)) %>%
  ungroup()

head(Incident_Report_clean)
```

```{r Identifying players that have data in both data sets}
#taking the IDs of players who are and aren't injured
all_IDs <- unique(Historical_Running_clean$anon_id)
#taking IDs that were injured and also have running imbalance data
injured_IDs <- intersect(unique(Incident_Report_clean$anon_id), all_IDs)
#taking all players with running imbalance data that don't have an injury
uninjured_IDs <- unique((Historical_Running_clean %>%
  filter(!anon_id %in% injured_IDs))$anon_id)
```

```{r Filtering out players without proper data from all data sets}
#injured players who also have running imbalance data
Incident_Report_clean <- Incident_Report_clean %>%
  filter(anon_id %in% injured_IDs)

#all players that only have running imbalance data or have both running imbalance data and incidence report
Historical_Running_clean <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs | anon_id %in% uninjured_IDs)
```

```{r Removing all unimportant objects created during cleaning}
#removing uncleaned data sets
remove(Incident_Report)
remove(Historical_Running)
remove(Catapult_Session)
```

# Section 1: Running Speed

## How often are athletes reaching ≥ 90% maximum velocity throughout a training season?

```{r}
# Bar chart for how often players reach ≥ 90% maximum velocity

# Count for how many times each anon_id hit ≥ 90% maximum velocity
hit_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% # Filter for training season
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date, Primary.Position) %>%
  group_by(anon_id, Primary.Position) %>%
  summarise(times_hit_90 = n(), .groups = "drop") %>%
  mutate(
    Position_Group = case_when(
      Primary.Position %in% c("QB", "LB", "TE", "RB") ~ "COMBO",
      Primary.Position %in% c("OL", "DL", "DE") ~ "BIGS",
      Primary.Position %in% c("WR", "DB", "DB, WR") ~ "SKILL",
      TRUE ~ "OTHER"
    ))

# Plot of all players' frequencies
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_hline(yintercept = mean(hit_90_counts$times_hit_90), 
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  labs(title = "Player Counts for Achieving ≥ 90% of Maximum Velocity During 2024–25 Season", 
       subtitle = paste("Team Average:", round(mean(hit_90_counts$times_hit_90), 1)),
       x = "Athlete ID", y = "Times ≥ 90%") +
  theme_classic() +
  theme(axis.text.x = element_text(size = 6, angle = 90))
```
# Do not need because we created a function to plot each position
```{r}
# Bar chart of times reached >90%, Quarterbacks

# Filter to only have data for QBs
QBs <- hit_90_counts %>%
  filter(Primary.Position == "QB")

# Calculate the averages first
overall_avg <- mean(hit_90_counts$times_hit_90)
qb_avg <- mean(QBs$times_hit_90)

ggplot(QBs, aes(x=anon_id, y=times_hit_90)) +
  geom_bar(stat="identity", fill = "#CFB87C") + 
  geom_text(aes(label = times_hit_90), 
            vjust = -0.5, 
            size = 3.5) +
  geom_hline(yintercept = overall_avg, 
             linetype = "dashed", color = "#000000") +
  geom_hline(yintercept = qb_avg, 
             linetype = "dashed", color = "#565A5C") +
  annotate("text", x = 1, y = overall_avg + 0.5, 
           label = "Team Avg", color = "#000000", size = 3) +
  annotate("text", x = 1, y = qb_avg + 0.5, 
           label = "QB Avg", color = "#565A5C", size = 3) +
  labs(title = "QB Counts for Reaching ≥90% Max Velocity", 
       x = "Athlete ID", y = "Times ≥90%",
       subtitle = paste("QB Average:", qb_avg)) +
  theme_classic()

```

## Facet plot useful, but hard to read so probably don't include in presentation.
```{r}
# Facet Plot

# Calculate overall average
overall_avg <- mean(hit_90_counts$times_hit_90)

# Plot faceted bar charts by position
ggplot(hit_90_counts, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
  geom_hline(yintercept = overall_avg, linetype = "dashed", color = "#000000") +
  annotate("text", x = 1, y = overall_avg + 0.5, 
           label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
  facet_wrap(~ Primary.Position, scales = "free_x") +
  labs(title = "Counts of ≥90% Max Velocity by Player and Position",
       y = "Times ≥90% Max Velocity",
       x = "Athlete ID") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
# Plots for each position

# Get the overall team average once from hit_90_counts
team_avg <- mean(hit_90_counts$times_hit_90)

# Define  positions to loop through
positions <- unique(hit_90_counts$Primary.Position)

# Plotting function:
plot_hit_90_by_position <- function(pos) {
  position_data <- hit_90_counts %>%
    filter(Primary.Position == pos)
  
  pos_avg <- mean(position_data$times_hit_90)
  
  ggplot(position_data, aes(x = anon_id, y = times_hit_90)) +
    geom_bar(stat = "identity", fill = "#CFB87C") +
    geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
    geom_hline(yintercept = pos_avg, linetype = "dashed", color = "#565A5C") +
    annotate("text", x = 1, y = pos_avg + 0.5, 
             label = "Pos Avg", color = "#565A5C", size = 3, hjust = 0) +
    geom_hline(yintercept = team_avg, linetype = "dashed", color = "#000000") +
    annotate("text", x = 1, y = team_avg + 0.5, 
             label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
    labs(title = paste("Times ≥90% Max Velocity –", pos),
         subtitle = paste0("Position Avg: ", round(pos_avg, 1),
                           " | Team Avg: ", round(team_avg, 1)),
         y = "Count",
         x = "anon_id") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Loop through each position and print the plot
plots_by_position <- lapply(positions, function(pos) {
  print(plot_hit_90_by_position(pos))
})

```

```{r}
# Table for average values of each position

position_averages <- hit_90_counts %>%
  group_by(Primary.Position) %>%
  summarise(avg_times_hit_90 = mean(times_hit_90), .groups = "drop") %>%
  arrange(desc(avg_times_hit_90))

position_averages_with_team <- bind_rows(
  tibble(
    Primary.Position = "Team Average",
    avg_times_hit_90 = team_avg
  ),
  position_averages
)

position_averages_with_team %>%
  gt() %>%
  tab_header(
    title = "Average Times ≥90% Max Velocity by Position and Team"
  )
```

DBs have the highest average times reaching ≥90% of Max velocity (18.4). The OL had the lowest (6.8). Team average was 11.8.

```{r}
# Plot counts for each position group

# Get the overall team average once from hit_90_counts
team_avg <- mean(hit_90_counts$times_hit_90)

# Define  positions to loop through
positions_groups <- unique(hit_90_counts$Position_Group)

# Plotting function:
plot_hit_90_group <- function(pos) {
  group_data <- hit_90_counts %>%
    filter(Position_Group == pos)
  
  group_avg <- mean(group_data$times_hit_90)
  
  ggplot(group_data, aes(x = anon_id, y = times_hit_90)) +
    geom_bar(stat = "identity", fill = "#CFB87C") +
    geom_text(aes(label = times_hit_90), vjust = -0.5, size = 3.5) +
    geom_hline(yintercept = group_avg, linetype = "dashed", color = "#565A5C") +
    annotate("text", x = 1, y = group_avg + 0.5, 
             label = "Group Avg", color = "#565A5C", size = 3, hjust = 0) +
    geom_hline(yintercept = team_avg, linetype = "dashed", color = "#000000") +
    annotate("text", x = 1, y = team_avg + 0.5, 
             label = "Team Avg", color = "#000000", size = 3, hjust = 0) +
    labs(title = paste("Times ≥90% Max Velocity –", pos),
         subtitle = paste0("Group Avg: ", round(group_avg, 1),
                           " | Team Avg: ", round(team_avg, 1)),
         y = "Count",
         x = "anon_id") +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Loop through each position and print the plot
plots_by_group <- lapply(positions_groups, function(pos) {
  print(plot_hit_90_group(pos))
})

```
The skill position group has the highest average times reached ≥90% Max with 17.1, then combo with 9.3, then bigs with 8.9.

```{r}
# Make facet plot for position groups so that it fits on the presentation slides better.

# Calculate averages for each group and the team
group_avgs <- hit_90_counts %>%
  group_by(Position_Group) %>%
  summarise(group_avg = mean(times_hit_90))

team_avg <- mean(hit_90_counts$times_hit_90)

# Join group averages back to the main data
plot_data <- hit_90_counts %>%
  left_join(group_avgs, by = "Position_Group")

# Create label data with formatted strings
avg_labels <- plot_data %>%
  group_by(Position_Group) %>%
  summarise(
    group_avg = unique(group_avg),
    team_avg = team_avg,
    label_x = 1,
    group_label_y = group_avg + 1,
    team_label_y = team_avg + 1,
    group_label = round(group_avg, 1),
    team_label = round(team_avg, 1),
    .groups = "drop"
  )

# Plot
ggplot(plot_data, aes(x = anon_id, y = times_hit_90)) +
  geom_bar(stat = "identity", fill = "#CFB87C") +
  geom_hline(aes(yintercept = group_avg), linetype = "dashed", color = "#000000") +
  geom_hline(yintercept = team_avg, linetype = "dotted", color = "#565A5C") +
  geom_text(data = avg_labels, aes(x = label_x, y = group_label_y, label = group_label),
            inherit.aes = FALSE, hjust = 0, size = 3, color = "#000000") +
  geom_text(data = avg_labels, aes(x = label_x, y = team_label_y, label = team_label),
            inherit.aes = FALSE, hjust = 0, size = 3, color = "#565A5C") +
  facet_wrap(~ Position_Group, scales = "free_x") +
  labs(title = "Times Reached ≥90% Max Velocity by Position Group",
       y = "Count", x = "anon_id") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
```

```{r}
# Create position group avg count table

# Create team average row
team_row <- tibble(
  Group = "Team Avg",
  Average = team_avg
)

# Create group average rows
group_rows <- hit_90_counts %>%
  group_by(Position_Group) %>%
  summarise(Average = mean(times_hit_90), .groups = "drop") %>%
  arrange(desc(Average)) %>%  # Sort from highest to lowest
  rename(Group = Position_Group)

# Combine them (team first)
combined_table <- bind_rows(team_row, group_rows)

# View the table
combined_table
```
No surprise that the skill position group has the highest average counts. 

```{r}
# >90% counts over the course of the season

# Trying to find out when are players reaching above 90% the most

# Create dataset that has the count of >90% of each day
daily_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  group_by(Date) %>%
  summarise(daily_hits = n())

# Plot
ggplot(daily_90_counts, aes(x = Date, y = daily_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_smooth(method = "loess", se = FALSE, color = "#565A5C", linetype = "dashed") +
  labs(title = "Daily Count of Players Reaching ≥ 90% of Max Velocity",
       subtitle = "Over the 2024–25 Training Season",
       x = "Date", y = "Times reached ≥ 90%") +
  theme_classic()
```
Data is noisy, so lets try grouping by week instead of every day.

```{r}
# >90% counts for each week instead of each day. Will be a little less noisy than the daily
weekly_90_counts <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Hit.90.Percent.Max == "Yes") %>%
  distinct(anon_id, Date) %>%
  mutate(week = floor_date(Date, unit = "week")) %>%
  group_by(week) %>%
  summarise(weekly_hits = n())

mean_val <- round(mean(weekly_90_counts$weekly_hits), 1)

ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_line(color = "#CFB87C", linewidth = 1) +
  geom_point(color = "#000000") +
  geom_smooth(method = "loess", se = FALSE, color = "#565A5C", linetype = "dashed", linewidth = 0.6) +
  labs(title = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
       subtitle = paste0("Week Average: ", mean_val),
       x = "Week", y = "Times reached ≥ 90%") +
  theme_classic()
```


```{r}
# Bar chart of weekly ≥90% counts

ggplot(weekly_90_counts, aes(x = week, y = weekly_hits)) +
  geom_col(fill = "#CFB87C", color = "#565A5C", width = 5) +
  geom_text(aes(label = weekly_hits), 
            vjust = -0.4, 
            size = 2.5, 
            color = "black") +
  geom_hline(yintercept = mean_val,
             linetype = "dashed", color = "#565A5C", linewidth = 0.5) +
  annotate("text", 
           x = min(weekly_90_counts$week) + 7,  # Adjust this date to place the label
           y = mean_val + 1.5,                 # Slightly above the line
           label = paste("Mean:", round(mean_val, 1)),
           size = 3.25, color = "#000000") +
  labs(
    title    = "Weekly Count of Players Reaching ≥ 90% of Max Velocity",
    subtitle = paste("Season Average per Week:", round(mean(weekly_90_counts$weekly_hits), 1)),
    x        = "Week",
    y        = "Times reached ≥ 90%"
  ) +
  scale_x_date(date_breaks = "2 weeks", date_labels = "%b %d") +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1)
  )
```
Some of our lowest average counts for a week seem to be during the season, whereas some of our highest counts seem to be pre and post-season.

## Should we consider the number of sprinting efforts that athletes are completing?

How often athletes are hitting which velocity bands in relation to their top speeds.

```{r}
# Create sum of weekly band totals

# Make sure week is defined
Catapult_Session_clean <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% 
  mutate(week = floor_date(Date, unit = "week"))

# Sum efforts by week and athlete
weekly_velocity_efforts <- Catapult_Session_clean %>%
  group_by(anon_id, week) %>%
  summarise(
    V2 = sum(Velocity.Band.2.Total.Effort.Count, na.rm = TRUE),
    V3 = sum(Velocity.Band.3.Total.Effort.Count, na.rm = TRUE),
    V4 = sum(Velocity.Band.4.Total.Effort.Count, na.rm = TRUE),
    V5 = sum(Velocity.Band.5.Total.Effort.Count, na.rm = TRUE),
    V6 = sum(Velocity.Band.6.Total.Effort.Count, na.rm = TRUE),
    V7 = sum(Velocity.Band.7.Total.Effort.Count, na.rm = TRUE),
    V8 = sum(Velocity.Band.8.Total.Effort.Count, na.rm = TRUE),
    total_efforts = V2 + V3 + V4 + V5 + V6 + V7 + V8,
    Weekly_Max_Velocity = max(Maximum.Velocity, na.rm = TRUE),
    Player_Max_Velocity = first(Player.Max.Velocity),
    .groups = "drop"
  ) %>%
  mutate(
    pct_of_max_velocity = (Weekly_Max_Velocity / Player_Max_Velocity) * 100
  )

# Create percentage of weekly efforts in each band
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  mutate(
    pct_V2 = (V2 / total_efforts) * 100,
    pct_V3 = (V3 / total_efforts) * 100,
    pct_V4 = (V4 / total_efforts) * 100,
    pct_V5 = (V5 / total_efforts) * 100,
    pct_V6 = (V6 / total_efforts) * 100,
    pct_V7 = (V7 / total_efforts) * 100,
    pct_V8 = (V8 / total_efforts) * 100
  )

weekly_velocity_efforts <- weekly_velocity_efforts %>%
  filter(total_efforts > 0)
```


```{r}
# Plot for each player

# Loop over all players and display their plots
unique(weekly_velocity_efforts$anon_id) %>%
  lapply(function(player_id) {
    
    # Prepare the data for one player
    player_weekly_data <- weekly_velocity_efforts %>%
      filter(anon_id == player_id) %>%
      select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
      pivot_longer(
        cols = starts_with("pct_V"),
        names_to = "velocity_band",
        values_to = "percent_effort"
      ) %>%
      mutate(
        velocity_band = factor(
          velocity_band,
          levels = paste0("pct_V", 8:2),
          labels = paste0("V", 8:2)
        )
      )
    
    # Skip empty data
    if (nrow(player_weekly_data) == 0) return(NULL)
    
    # Generate and print the plot
    plot <- ggplot(player_weekly_data, aes(x = week)) +
      geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2" = "darkgreen",
          "V3" = "green2",
          "V4" = "greenyellow",
          "V5" = "yellow",
          "V6" = "orange",
          "V7" = "tomato",
          "V8" = "firebrick"
  )
) +
    scale_y_continuous(
        name = "Band Distribution (%)",
        sec.axis = sec_axis(~ ., name = "Weekly Max Velocity (% of PR)")
      ) +
      labs(
        title = paste("Velocity Band Effort % and Speed Trend for Player", player_id),
        x = "Week",
        fill = "Velocity Band"
      ) +
      theme_classic()
    
    print(plot)  # Display plot
    
    return(NULL)
  })
```

Each player is different in reaching different velocity bands in relation to their top speeds. Some players, ID_11 for example, rarely enter bands 6 or higher, even if they are close to 100% of their max velocity. This shows the issues with using absolute bands for the whole team.


```{r}
# Filter for ID_11 and reshape the data
player_weekly_data <- weekly_velocity_efforts %>%
  filter(anon_id == "ID_11") %>%
  select(week, pct_V2:pct_V8, pct_of_max_velocity) %>%
  pivot_longer(
    cols = starts_with("pct_V"),
    names_to = "velocity_band",
    values_to = "percent_effort"
  ) %>%
  mutate(
    velocity_band = factor(
      velocity_band,
      levels = paste0("pct_V", 8:2),
      labels = paste0("V", 8:2)
    )
  )

# Plot for ID_11
ggplot(player_weekly_data, aes(x = week)) +
  geom_col(aes(y = percent_effort, fill = velocity_band), position = "stack") +
  geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
  scale_fill_manual(
    values = c(
      "V2" = "darkgreen",
      "V3" = "green2",
      "V4" = "greenyellow",
      "V5" = "yellow",
      "V6" = "orange",
      "V7" = "tomato",
      "V8" = "firebrick"
    )
  ) +
  scale_y_continuous(
    name = "Band Distribution (%)",
    sec.axis = sec_axis(~ ., name = "Weekly Max Velocity (% of PR)")
  ) +
  labs(
    title = "Velocity Band Effort % and Speed Trend for Player ID_11",
    x = "Week",
    fill = "Velocity Band"
  ) +
  theme_classic()
```


```{r}
# Pick one player
player_id <- "ID_93"

# Filter data for that player and weeks
player_data <- weekly_velocity_efforts %>%
  filter(anon_id == player_id)

# Plot of Total Sprinting Efforts per Week
ggplot(player_data, aes(x = week, y = total_efforts)) +
  geom_line(color = "#CFB87C", size = 1) +
  geom_point(color = "#CFB87C", size = 2) +
  labs(
    title = paste("Total Sprinting Efforts per Week for Player", player_id),
    x = "Week",
    y = "Total Sprinting Efforts"
  ) +
  theme_classic()

#Plot of Sprint Effort Distribution by Velocity Band

# Reshape absolute counts to long format
player_counts_long <- player_data %>%
  select(week, V2, V3, V4, V5, V6, V7, V8) %>%
  pivot_longer(
    cols = V2:V8,
    names_to = "velocity_band",
    values_to = "effort_count"
  ) %>%
  mutate(
    velocity_band = factor(velocity_band, levels = paste0("V", 8:2))
  )

ggplot(player_counts_long, aes(x = week, y = effort_count, fill = velocity_band)) +
  geom_col(position = "stack") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = paste("Sprint Effort Distribution by Velocity Band for Player", player_id),
    x = "Week",
    y = "Sprint Effort Count",
    fill = "Velocity Band"
  ) +
  theme_classic()
```
#### Exploring correlations between bands and max speeds

```{r}


# Select relevant columns
cor_data <- weekly_velocity_efforts %>%
  select(pct_of_max_velocity, pct_V2, pct_V3, pct_V4, pct_V5, pct_V6, pct_V7, pct_V8)

# Compute correlation matrix
cor_matrix <- cor(cor_data, use = "pairwise.complete.obs")

cor_matrix


# Compute correlations with pct_of_max_velocity
cor_values <- cor_data %>%
  summarise(across(-pct_of_max_velocity,
                   ~ cor(.x, cor_data$pct_of_max_velocity, use = "pairwise.complete.obs"))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "correlation")

# Plot with labels
ggplot(cor_values, aes(x = reorder(variable, correlation), y = correlation)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(correlation, 2)), 
            hjust = ifelse(cor_values$correlation >= 0, -0.1, 1.1), 
            color = "black", size = 3) +
  coord_flip() +
  geom_hline(yintercept = 0, color = "black", linetype = "dashed") +
  labs(
    title = "Correlation Between Percent in Band and Percent of Max Velocity",
    x = "Band",
    y = "Pearson Correlation"
  ) +
  theme_classic() 
```
V4 and V5 appear to be slightly more predictive of top-speed achievement than V6, V7 or V8 — possibly because they're more frequently reached zones.

V2 has a moderately strong negative linear relationship with reaching top-speed. When athletes spend more effort in this low-speed band, their weekly top speed (as % of max) tends to be lower.

```{r}
# Linear model with all bands predicting pct_of_max_velocity
model_all_bands <- lm(
  pct_of_max_velocity ~ pct_V2 + pct_V3 + pct_V4 + pct_V5 + pct_V6 + pct_V7 + pct_V8,
  data = weekly_velocity_efforts
)
summary(model_all_bands)
```
```{r}
# Compute change in top speeds week-to-week
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  arrange(anon_id, week) %>%
  group_by(anon_id) %>%
  mutate(
    pct_of_max_velocity_change = pct_of_max_velocity - lag(pct_of_max_velocity)
  ) %>%
  ungroup()
```

```{r}
# Scatterplot with smoothing line for V8 band effort

# Create V8 data set that only includes when someone was in V8 for a given week
V8 <- weekly_velocity_efforts %>%
  filter(pct_V8 > 0)

# Plot
ggplot(V8, aes(x = pct_V8, y = pct_of_max_velocity)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred") +
  labs(
    title = "Percent of Max Velocity vs. V8 Band Usage",
    x = "Percent of Total Effort in V8 Band (%)",
    y = "Percent of Max Velocity"
  ) +
  theme_classic()


# Scatterplot for V2 band effort vs. Percent of Max Velocity
# Create V2 data set that only includes when someone was in V2 for a given week
V2 <- weekly_velocity_efforts %>%
  filter(pct_V2 > 0)

# Plot
ggplot(V2, aes(x = pct_V2, y = pct_of_max_velocity)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "loess", se = TRUE, color = "darkred") +
  labs(
    title = "Percent of Max Velocity vs. V2 Band Usage",
    x = "Percent of Total Effort in V2 Band (%)",
    y = "Percent of Max Velocity"
  ) +
  theme_classic()
```

```{r}
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  group_by(anon_id) %>%
  mutate(
    lag_V2 = lag(V2),
    lag_V3 = lag(V3),
    lag_V4 = lag(V4),
    lag_V5 = lag(V5),
    lag_V6 = lag(V6),
    lag_V7 = lag(V7),
    lag_V8 = lag(V8),
    
    lag_pct_V2 = lag(pct_V2),
    lag_pct_V3 = lag(pct_V3),
    lag_pct_V4 = lag(pct_V4),
    lag_pct_V5 = lag(pct_V5),
    lag_pct_V6 = lag(pct_V6),
    lag_pct_V7 = lag(pct_V7),
    lag_pct_V8 = lag(pct_V8)
  ) %>%
  ungroup()
```

```{r}
# Maybe group by position

# Get Player positions
player_positions <- Catapult_Session_clean %>%
  select(anon_id, Primary.Position) %>%
  distinct()

# Join Position into weekly_velocity_efforts
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  left_join(player_positions, by = "anon_id")

# Position groups
weekly_velocity_efforts <- weekly_velocity_efforts %>%
  mutate(
    Position_Group = case_when(
      Primary.Position %in% c("QB", "LB", "TE", "RB") ~ "COMBO",
      Primary.Position %in% c("OL", "DL", "DE") ~ "BIGS",
      Primary.Position %in% c("WR", "DB", "DB, WR") ~ "SKILL",
      TRUE ~ "OTHER"
    )
  ) %>%
  filter(Position_Group != "OTHER")

ggplot(weekly_velocity_efforts, aes(x = Position_Group, y = pct_of_max_velocity_change)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red", outlier.size = 1.5) +
  labs(
    title = "Weekly Change in % of Max Velocity by Position",
    x = "Position",
    y = "% Change in Max Velocity"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
weekly_band_effort_by_group <- weekly_velocity_efforts %>%
  group_by(week, Position_Group) %>%
  summarise(
    mean_V2 = mean(V2, na.rm = TRUE),
    mean_V3 = mean(V3, na.rm = TRUE),
    mean_V4 = mean(V4, na.rm = TRUE),
    mean_V5 = mean(V5, na.rm = TRUE),
    mean_V6 = mean(V6, na.rm = TRUE),
    mean_V7 = mean(V7, na.rm = TRUE),
    mean_V8 = mean(V8, na.rm = TRUE),
    
    mean_lag_V2 = mean(lag_V2, na.rm = TRUE),
    mean_lag_V3 = mean(lag_V3, na.rm = TRUE),
    mean_lag_V4 = mean(lag_V4, na.rm = TRUE),
    mean_lag_V5 = mean(lag_V5, na.rm = TRUE),
    mean_lag_V6 = mean(lag_V6, na.rm = TRUE),
    mean_lag_V7 = mean(lag_V7, na.rm = TRUE),
    mean_lag_V8 = mean(lag_V8, na.rm = TRUE),
    
    mean_pct_V2 = mean(pct_V2, na.rm = TRUE),
    mean_pct_V3 = mean(pct_V3, na.rm = TRUE),
    mean_pct_V4 = mean(pct_V4, na.rm = TRUE),
    mean_pct_V5 = mean(pct_V5, na.rm = TRUE),
    mean_pct_V6 = mean(pct_V6, na.rm = TRUE),
    mean_pct_V7 = mean(pct_V7, na.rm = TRUE),
    mean_pct_V8 = mean(pct_V8, na.rm = TRUE),
    mean_weekly_max_velocity = mean(Weekly_Max_Velocity, na.rm = TRUE),
    mean_pct_max_velocity = mean(pct_of_max_velocity, na.rm = TRUE),
    mean_pct_max_velocity_change = mean(pct_of_max_velocity_change, na.rm = TRUE),
    n_players = n()
  ) %>%
  ungroup()
```


```{r}

# Pivot longer to get bands in one column for easier plotting
band_long <- weekly_band_effort_by_group %>%
  pivot_longer(
    cols = starts_with("mean_pct_V"),
    names_to = "velocity_band",
    values_to = "mean_pct_effort"
  ) %>%
  mutate(
    velocity_band = factor(velocity_band, levels = paste0("mean_pct_V", 2:8))
  )

ggplot(band_long, aes(x = week, y = mean_pct_effort, color = velocity_band)) +
  geom_line(size = 1) +
  facet_wrap(~ Position_Group) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group",
    x = "Week",
    y = "Mean % Effort",
    color = "Velocity Band"
  ) +
   scale_color_manual(
        values = c(
          "mean_pct_V2" = "darkgreen",
          "mean_pct_V3" = "green2",
          "mean_pct_V4" = "greenyellow",
          "mean_pct_V5" = "yellow",
          "mean_pct_V6" = "orange",
          "mean_pct_V7" = "tomato",
          "mean_pct_V8" = "firebrick"
  )
) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

This shows the mean percent of total efforts in each band over weeks by each position group.

```{r}
# Plot mean weekly max velocity
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_weekly_max_velocity, color = Position_Group)) +
  geom_line(size = 1.2) +
  labs(
    title = "Mean Weekly Max Velocity by Position Group",
    x = "Week",
    y = "Mean Weekly Max Velocity"
  ) +
  theme_classic()


# Plot mean % max velocity change
ggplot(weekly_band_effort_by_group, aes(x = week, y = mean_pct_max_velocity_change, color = Position_Group)) +
  geom_line(size = 1.2) +
  labs(
    title = "Mean % Change in Max Velocity by Position Group",
    x = "Week",
    y = "Mean % Change in Max Velocity"
  ) +
  theme_classic()
```

```{r}
ggplot() +
  # Plot the stacked lines for band effort %
  geom_line(data = band_long, aes(x = week, y = mean_pct_effort, color = velocity_band), size = 1) +
  geom_line(data = weekly_band_effort_by_group,
            aes(x = week, y = mean_weekly_max_velocity),
            color = "black", size = 1.2) +

  facet_wrap(~ Position_Group) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group\nwith Mean Weekly Max Velocity",
    x = "Week",
    y = "Mean % Effort",
    color = "Velocity Band"
  ) +
  scale_color_manual(
        values = c(
          "mean_pct_V2" = "darkgreen",
          "mean_pct_V3" = "green2",
          "mean_pct_V4" = "greenyellow",
          "mean_pct_V5" = "yellow",
          "mean_pct_V6" = "orange",
          "mean_pct_V7" = "tomato",
          "mean_pct_V8" = "firebrick"
  )
) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
# Define max effort and max velocity range
max_effort <- 67
max_velocity <- max(weekly_band_effort_by_group$mean_weekly_max_velocity, na.rm = TRUE)

ggplot() +
  geom_line(data = band_long, aes(x = week, y = mean_pct_effort, color = velocity_band), size = 1) +
  
  # Scale max_velocity to % effort scale to overlay on left axis
  geom_line(data = weekly_band_effort_by_group, 
            aes(x = week, y = (mean_weekly_max_velocity / max_velocity) * max_effort, color = "Max Velocity"),
            size = 1.1) +
  
  facet_wrap(~ Position_Group) +
  scale_y_continuous(
    name = "Mean % Effort",
    sec.axis = sec_axis(~ . * max_velocity / max_effort, name = "Mean Weekly Max Velocity (m/s)")
  ) +
  scale_color_manual(
    values = c(
      "mean_pct_V2" = "darkgreen",
      "mean_pct_V3" = "green2",
      "mean_pct_V4" = "greenyellow",
      "mean_pct_V5" = "yellow",
      "mean_pct_V6" = "orange",
      "mean_pct_V7" = "tomato",
      "mean_pct_V8" = "firebrick",
      "Max Velocity" = "black"
    )
  ) +
  labs(
    title = "Weekly Mean % Effort in Velocity Bands by Position Group\nwith Mean Weekly Max Velocity",
    x = "Week",
    color = "Velocity Band"
  ) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```



```{r}
# Modeling

# Separate by position group
bigs <- weekly_band_effort_by_group %>%
  filter(Position_Group == "BIGS")

skill <- weekly_band_effort_by_group %>%
  filter(Position_Group == "SKILL")

combo <- weekly_band_effort_by_group %>%
  filter(Position_Group == "COMBO")

# Bigs Model
model_bigs <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = bigs
  )
summary(model_bigs)

# Skill Model
model_skill <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = skill
  )
summary(model_skill)

# Combo Model
model_combo <- lm(
  mean_weekly_max_velocity ~ mean_lag_V2 + mean_lag_V3 + mean_lag_V4 + mean_lag_V5 + mean_lag_V6 + mean_lag_V7 + mean_lag_V8, data = combo)
summary(model_combo)
```

```{r}
# Group correlation:

# Select relevant variables
vars <- c("mean_weekly_max_velocity", "mean_lag_V2", "mean_lag_V3", 
          "mean_lag_V4", "mean_lag_V5", "mean_lag_V6", "mean_lag_V7", "mean_lag_V8")

# Correlation matrices
cor_bigs <- cor(bigs[, vars], use = "complete.obs")
cor_skill <- cor(skill[, vars], use = "complete.obs")
cor_combo <- cor(combo[, vars], use = "complete.obs")
round(cor_bigs, 2)
round(cor_skill, 2)
round(cor_combo, 2)
```


```{r}
# Reduce for better modeling
bigs <- bigs %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band = (mean_lag_V4 + mean_lag_V5 + mean_lag_V6) / 3,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
model_bigs_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = bigs)
summary(model_bigs_simple)


# Skill Group
skill <- skill %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3 + mean_lag_V4) / 3,
    mid_band = (mean_lag_V5 + mean_lag_V6) / 2,
    high_band = (mean_lag_V7 + mean_lag_V8) / 2
  )
model_skill_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = skill)
summary(model_skill_simple)

# Combo group
combo <- combo %>%
  mutate(
    low_band = (mean_lag_V2 + mean_lag_V3) / 2,
    mid_band = (mean_lag_V4 + mean_lag_V5) / 2,
    high_band = (mean_lag_V6 + mean_lag_V7 + mean_lag_V8) / 3
  )
model_combo_simple <- lm(mean_weekly_max_velocity ~ low_band + mid_band + high_band, data = combo)
summary(model_combo_simple)
```
Bigs model:
  Statistically significant overall and explains around 33% of the variation of max velocity. The low band is significantly negative. Every unit increase in the low band results in a decrease in max velocity. The mid and high bands are not significant but the high band shows a positive trend.
  Based on this model, for bigs, more low-band effort may reduce weekly top speed, possibly indicating underexposure of high speeds. High-band efforts might help, but aren’t clearly impactful in this group.

Skill model:
  Almost significant. Low band has a negative effect, mid band has a positive effect, and high band is not significant. 
  Based on this model, for skill players, more low-intensity exposure seems detrimental to max speed, while moderate band (V5–V6) exposure may enhance it.
  
Combo model:
  Model is not significant. The combo position group contains many different type of positions (QBs, RBs, TEs, LBs) with different movement and velocities. This variability may obscure relationships.
  
Across groups, low-band exposure is consistently negatively related to peak weekly speed — suggesting that too much low-intensity work may reduce the capacity to reach high speeds.

## Are relative efforts and bands more advantageous than the absolute bands provided?

Look at ID_11's plot from the last question. This player is an offensive lineman. Looking at the plot, he is mostly in band 2 and 3 while never reaching bands 7 or 8 and rarely reaching band 6. Despite this, this athlete's weekly max velocity is always at least 75% of their all-time max velocity. Only looking at the absolute bands that are provided, we might come to the conclusion that ID_11 is not achieving high running speeds, however after looking at his max velocity efforts relative to his all-time max velocity, he is reaching high running speeds.

#### Create Relative Bands for ID_11

```{r}
# Create Relative Bands for ID_11

# Prepare ID_11 dataset with relative bands
ID_11 <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>% 
  filter(anon_id == "ID_11", Maximum.Velocity != 0) %>%
  filter(!is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
    # Create Relative Bands
    relative_band = cut(
      pct_of_max,
      breaks = c(20, 40, 50, 60, 70, 80, 90, 100),
      labels = c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
      right = TRUE, # Set to TRUE to include 100% in band
      include.lowest = TRUE  # Includes 0 in first band
    ),
    # Reverse factor levels so highest band is on top in plot
  relative_band = factor(
    relative_band,
    levels = rev(c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
  )
  )

# Summarize counts per week and relative band
weekly_effort <- ID_11 %>%
  filter(!is.na(relative_band)) %>%
  group_by(week, relative_band) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(week) %>%
  mutate(pct_effort = (count / sum(count)) * 100) %>%
  ungroup()

# Pivot wider to get percentages per band as columns
weekly_effort_wide <- weekly_effort %>%
  select(week, relative_band, pct_effort) %>%
  pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)

# Calculate weekly max velocity % of player max velocity
weekly_max_velocity <- ID_11 %>%
  group_by(week) %>%
  summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")

# Join the max velocity % back to wide effort data
weekly_effort_wide <- weekly_effort_wide %>%
  left_join(weekly_max_velocity, by = "week")

# Pivot longer for stacked bar plotting
weekly_effort_long <- weekly_effort_wide %>%
  pivot_longer(
    cols = c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
    names_to = "relative_band",
    values_to = "pct_effort"
  ) %>%
  mutate(
    relative_band = factor(
      relative_band,
      levels = rev(c("V2 (20-40%)", "V3 (40-50%)","V4 (50-60%)" ,"V5 (60-70%)", "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
    )
  )
```

```{r}
# Plot Relative Bands for ID_11
ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
  geom_col(position = "stack") +
  geom_line(aes(y = pct_of_max_velocity, group = 1), 
            color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), 
             color = "black", size = 2) +
  scale_fill_manual(
    values = c(
      "V2 (20-40%)" = "darkgreen",
      "V3 (40-50%)" = "green2",
      "V4 (50-60%)" = "greenyellow",
      "V5 (60-70%)" = "yellow",
      "V6 (70-80%)" = "orange",
      "V7 (80-90%)" = "tomato",
      "V8 (90-100%)" = "darkred"
    )
  ) +
  labs(
    title = "Relative Velocity Band Effort % per Week, ID_11",
    x = "Week",
    y = "Percent of Weekly Efforts",
    fill = "Relative Velocity Band"
  ) + 
  scale_y_continuous(
    sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
  ) +
  theme_classic()
```
Comparing this with the absolute bands graph of ID_11, we can see that this is a much better representation of their running speeds.

#### Relative Bands for each athlete

```{r}
# Function to create relative bands and plot these for each athlete

# Arrange data so that the plot will display in anon_id order
clean_anons <- Catapult_Session_clean %>%
  arrange(anon_id)

# Loop over all athletes and create their relative bands plots
unique(clean_anons$anon_id) %>%
  lapply(function(player_id) {
    # Prepare data for player
    player_data <- Catapult_Session_clean %>%
      filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
      filter(anon_id == player_id, Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
      mutate(
        week = floor_date(Date, unit = "week"),
        pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
        # Create Relative Bands
        relative_band = cut(
          pct_of_max,
          breaks = c(20, 40, 50, 60, 70, 80, 90, 100),
          labels = c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                     "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"),
          right = TRUE, # Set to true to include 100%
          include.lowest = TRUE  # Set to true to include 0 in 1st interval
        ),
        # Reverse factor levels so highest band is on top in plot
        relative_band = factor(
          relative_band,
          levels = rev(c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                         "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
        )
      )
    
    if (nrow(player_data) == 0) return(NULL)  # skip empty

    # Summarize counts per week and relative band
    weekly_effort <- player_data %>%
      filter(!is.na(relative_band)) %>%
      group_by(week, relative_band) %>%
      summarise(count = n(), .groups = "drop") %>%
      group_by(week) %>%
      mutate(pct_effort = (count / sum(count)) * 100) %>%
      ungroup()

    # Pivot wider to get percentages per band as columns
    weekly_effort_wide <- weekly_effort %>%
      select(week, relative_band, pct_effort) %>%
      pivot_wider(names_from = relative_band, values_from = pct_effort, values_fill = 0)

    # Calculate weekly max velocity % of player max velocity
    weekly_max_velocity <- player_data %>%
      group_by(week) %>%
      summarise(pct_of_max_velocity = max(pct_of_max), .groups = "drop")

    # Join
    weekly_effort_wide <- weekly_effort_wide %>%
      left_join(weekly_max_velocity, by = "week")
    
    # Pivot longer for stacked bar plotting
    weekly_effort_long <- weekly_effort_wide %>%
      pivot_longer(
          cols = -c(week, pct_of_max_velocity),
        names_to = "relative_band",
        values_to = "pct_effort"
      ) %>%
      mutate(
        relative_band = factor(
          relative_band,
          levels = rev(c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                         "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)"))
        )
      )
    
    # Plot
    plot <- ggplot(weekly_effort_long, aes(x = week, y = pct_effort, fill = relative_band)) +
      geom_col(position = "stack") +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2 (20-40%)" = "darkgreen",
          "V3 (40-50%)" = "green2",
          "V4 (50-60%)" = "greenyellow",
          "V5 (60-70%)" = "yellow",
          "V6 (70-80%)" = "orange",
          "V7 (80-90%)" = "tomato",
          "V8 (90-100%)" = "darkred"
        )
      ) +
      scale_y_continuous(
        name = "Percent of Weekly Effort",
        sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
      ) +
      labs(
        title = paste("Relative Velocity Band Effort % and Max Velocity Trend for Player", player_id),
        x = "Week",
        fill = "Relative Velocity Band"
      ) +
      theme_classic()

    print(plot)
    
    return(NULL)
  })
```


```{r}
# Create relative bands for all athletes

# Define the velocity band labels
band_levels <- c("V2 (20-40%)", "V3 (40-50%)", "V4 (50-60%)", "V5 (60-70%)",
                     "V6 (70-80%)", "V7 (80-90%)", "V8 (90-100%)")

# Create exposure data frame for all players
weekly_sprint_exposure <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100,
    relative_band = cut(
      pct_of_max,
      breaks = c(20, 40, 50, 60, 70, 80, 90, 100.1),
      labels = band_levels,
      right = TRUE,  # will include 100
      include.lowest = TRUE
    ),
    relative_band = factor(relative_band, levels = band_levels)
  ) %>%
  filter(!is.na(relative_band)) %>%
  group_by(anon_id, week, relative_band) %>%
  summarise(effort_count = n(), .groups = "drop") %>%
  group_by(anon_id, week) %>%
  mutate(
    total_efforts = sum(effort_count),
    pct_effort = (effort_count / total_efforts) * 100
  ) %>%
  ungroup() %>%
  select(anon_id, week, relative_band, pct_effort) %>%
  pivot_wider(
    names_from = relative_band,
    values_from = pct_effort,
    values_fill = 0
  )

# Add weekly max velocity % of max
weekly_max_pct <- Catapult_Session_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01")) %>%
  filter(Maximum.Velocity != 0, !is.na(Maximum.Velocity)) %>%
  mutate(
    week = floor_date(Date, unit = "week"),
    pct_of_max = (Maximum.Velocity / Player.Max.Velocity) * 100
  ) %>%
  group_by(anon_id, week) %>%
  summarise(pct_of_max_velocity = max(pct_of_max, na.rm = TRUE), .groups = "drop")

# Final data set
relative_bands <- weekly_sprint_exposure %>%
  left_join(weekly_max_pct, by = c("anon_id", "week"))

# Convert to long for plotting
relative_bands_long <- relative_bands %>%
  pivot_longer(
    cols = starts_with("V"),
    names_to = "relative_band",
    values_to = "pct_effort"
  ) %>%
  mutate(
    relative_band = factor(relative_band, levels = band_levels)
  )
```

```{r}
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Positions <- c("COMBO", "BIG", "SKILL")

weekly_sprint_exposure <- left_join(weekly_sprint_exposure, player_positions, by = "anon_id", relationship = "many-to-many")

weekly_sprint_exposure <- weekly_sprint_exposure %>%
  mutate(Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
                               Primary.Position %in% BIG ~ "BIG",
                               Primary.Position %in% SKILL ~ "SKILL"))

weekly_sprint_exposure <- distinct(weekly_sprint_exposure)

max_velocities <- weekly_velocity_efforts[,c("anon_id","week","pct_of_max_velocity")]

weekly_sprint_exposure <- left_join(weekly_sprint_exposure, max_velocities, by = c("anon_id", "week"), relationship="many-to-many")




sprint_exposure_big <- weekly_sprint_exposure %>%
  filter(Position == "BIG") %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()

sprint_exposure_combo <- weekly_sprint_exposure %>%
  filter(Position == "COMBO")  %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()

sprint_exposure_skill <- weekly_sprint_exposure %>%
  filter(Position == "SKILL")  %>%
  group_by(week) %>%
  mutate(avg_pct_max = mean(pct_of_max_velocity),
         avg_V8 = mean(`V8 (90-100%)`),
         avg_V7 = mean(`V7 (80-90%)`),
         avg_V6 = mean(`V6 (70-80%)`),
         avg_V5 = mean(`V5 (60-70%)`),
         avg_V4 = mean(`V4 (50-60%)`),
         avg_V3 = mean(`V3 (40-50%)`),
         avg_V2 = mean(`V2 (20-40%)`)) %>%
  ungroup() %>%
  na.omit()
  
```

When I built the models that considered the relative bands instead of the absolute bands provided, I found that initially, the results were a lot better than the previous models with the absolute bands. This suggests that the relative bounds which are segmented into 10% chunks of all time maximum velocity for each player is a lot more indicative of effort and therefore their percentage of maximum velocity in a given week. We are able to see though that there is a lot of multicollinearity within the relative bands calculated. 

```{r}
# Modeling

# Bigs Model
model_big <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
                data=sprint_exposure_big)
summary(model_big)


# Skill Model
model_skill <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
                data=sprint_exposure_skill)
summary(model_skill)


# Combo Model
model_combo <- lm(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
               data=sprint_exposure_combo)
summary(model_combo)

```

Looking at the correlations between all of the predictors as well as the response variable in the model we can see that a lot of the predictors have super strong correlations which each other, some more so than they are with the response. This suggests that the issue with the models above is that multicollinearity is bogging them down and we are not seeing the true relationships between the predictors and the response. 

```{r Looking at collinear predictors}
cor(sprint_exposure_big[,13:20])
cor(sprint_exposure_skill[,13:20])
cor(sprint_exposure_combo[,13:20])
```

All of the models below are the best 3 predictor models that resulted for each position after running the best subsets algorithm. All of the following have reductions in adjusted-$R^2$ but they don't seem significant considering that we took out over half of the predictors and maintained a relatively similar adjusted-$R^2$ value. 

```{r running best subsets on the models to limit collinearity}
#Bigs

#best_sub_big <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_big, method="exhaustive")
#summary(best_sub_big)

model_big <- lm(avg_pct_max~avg_V2+avg_V6+avg_V7,
                data=sprint_exposure_big)
summary(model_big)


#Skills

#best_sub_skill <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_skill, method="exhaustive")
#summary(best_sub_skill)

model_skill <- lm(avg_pct_max~avg_V2+avg_V5+avg_V6,
                data=sprint_exposure_skill)
summary(model_skill)

#Combos
#best_sub_combo <- regsubsets(avg_pct_max~avg_V2+avg_V3+avg_V4+avg_V5+avg_V6+avg_V7+avg_V8,
#                data=sprint_exposure_combo, method="exhaustive")
#summary(best_sub_combo)

model_combo <- lm(avg_pct_max~avg_V2+avg_V4+avg_V5,
               data=sprint_exposure_combo)
summary(model_combo)
```
Since the models perform roughly the same with just 3 predictors as they do with all 7, it may make sense to instead truncate into more general bins. This may help us understand the relationship with low, medium and high effort with weekly maximum velocity. 

```{r}
sprint_exposure_big <- sprint_exposure_big %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)

sprint_exposure_skill <- sprint_exposure_skill %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)

sprint_exposure_combo <- sprint_exposure_combo %>%
  mutate(avg_low = (avg_V2+avg_V3)/2,
         avg_medium = (avg_V4+avg_V5+avg_V6)/3,
         avg_high = (avg_V7+avg_V8)/2)
```


```{r}
#bigs
model_big_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_big)
summary(model_big_general)

#skills
model_skill_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_skill)
summary(model_skill_general)

#combos
model_combo_general <- lm(avg_pct_max~avg_low+avg_medium+avg_high,
                        data=sprint_exposure_combo)
summary(model_combo_general)
```
Still pretty bad, I am going to use principle components regression to see if there are other relationships we are missing

```{r}
preds <- sprint_exposure_big[,14:20]

pc_loadings <- prcomp(preds, scale=TRUE)$rotation[,c(1,2,3)]
pc_loadings
```


## How does sprinting exposure (# of efforts, % max reached) relate to incidence of hamstring injuries?

```{r}
# Join relative_bands data set with injury data

Incident_dates <- Incident_Report_clean %>%
  filter(Date >= as.Date("2024-06-30") & Date <= as.Date("2025-07-01"))

# Get ids of athletes with injury
injured_ids <- unique(Incident_dates$anon_id)

# Filter relative_bands to only include athletes who got injured
injured_data <- relative_bands_long %>%
  filter(anon_id %in% injured_ids)

# Create injury weeks
injury_weeks <- Incident_Report_clean %>%
  mutate(
    week = floor_date(as.Date(Date.of.Injury), unit = "week")
  ) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)

injuries_and_running <- injured_data %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = ifelse(is.na(injury), 0, injury))
```


```{r}

injuries_and_running <- injuries_and_running %>%
  mutate(relative_band = factor(relative_band, levels = band_levels))


# Filter to only include players 194 and 285
plot_data <- injuries_and_running %>%
  filter(anon_id %in% c("ID_194", "ID_285"))

# Identify injury weeks for shading
injury_shading <- plot_data %>%
  filter(injury == 1) %>%
  mutate(
    xmin = week - 3,
    xmax = week + 3,
    ymin = -Inf,
    ymax = Inf
  )

# Create a single faceted plot
ggplot(plot_data, aes(x = week)) +
  geom_rect(data = injury_shading,
            aes(xmin = xmin, xmax = xmax, ymin = ymin, ymax = ymax),
            fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
  geom_col(aes(y = pct_effort, fill = relative_band), position = position_stack(reverse = TRUE)) +
  geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
  geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
  facet_wrap(~ anon_id, scales = "free_x") +
  scale_fill_manual(
    values = c(
      "V2 (20-40%)" = "darkgreen",
      "V3 (40-50%)" = "green2",
      "V4 (50-60%)" = "greenyellow",
      "V5 (60-70%)" = "yellow",
      "V6 (70-80%)" = "orange",
      "V7 (80-90%)" = "tomato",
      "V8 (90-100%)" = "darkred"
    )
  ) +
  scale_y_continuous(
    name = "Percent of Weekly Effort",
    sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
  ) +
  labs(
    title = "Velocity Band Exposure & Max % for Injured Players 194 and 285",
    x = "Week",
    fill = "Relative Velocity Band"
  ) +
  theme_classic()
```


```{r}
# Loop over injured athletes and create their plots

unique(injuries_and_running$anon_id) %>%
  lapply(function(player_id) {
    
    # Filter data for this player
    plot_data <- injuries_and_running %>%
      filter(anon_id == player_id)
    
    if (nrow(plot_data) == 0) return(NULL)  # Skip if no data
    
    # Get injury weeks
    injury_week_nums <- plot_data %>%
      filter(injury == 1) %>%
      pull(week)
    
    # Plot
    plot <- ggplot(plot_data, aes(x = week)) +
      # Highlight injury weeks with shaded blue areas
      geom_rect(data = data.frame(week = injury_week_nums),
                aes(xmin = week - 3, xmax = week + 3, ymin = -Inf, ymax = Inf),
                fill = "blue", alpha = 0.2, inherit.aes = FALSE) +
      geom_col(aes(y = pct_effort, fill = relative_band), position = position_stack(reverse = TRUE)) +
      geom_line(aes(y = pct_of_max_velocity, group = 1), color = "black", size = 1.2) +
      geom_point(aes(y = pct_of_max_velocity), color = "black", size = 2) +
      scale_fill_manual(
        values = c(
          "V2 (20-40%)" = "darkgreen",
          "V3 (40-50%)" = "green2",
          "V4 (50-60%)" = "greenyellow",
          "V5 (60-70%)" = "yellow",
          "V6 (70-80%)" = "orange",
          "V7 (80-90%)" = "tomato",
          "V8 (90-100%)" = "darkred"
        )
      ) +
      scale_y_continuous(
        name = "Percent of Weekly Effort",
        sec.axis = sec_axis(~ ., name = "Percent of Max Velocity")
      ) +
      labs(
        title = paste("Velocity Band Exposure & Max % for Injured Player", player_id),
        x = "Week",
        fill = "Relative Velocity Band"
      ) +
      theme_classic()
    
    print(plot)
    
    return(NULL)
  })
```
The blue lines indicate a week where an injury occurred.
No real insights can be made from looking at relative velocity bands and injury occurrances. 

```{r}
model_injury <- glm(injury ~ pct_of_max_velocity,
                    data = injuries_and_running,
                    family = "binomial")

summary(model_injury)
```
Simple model
Each 1 percentage point increase in pct_of_max_velocity increases the log-odds of injury by 0.00923.
Not significant.


```{r}
# Get summary stats for both injury and non injury

# Clean and get date format
injuries_and_running_clean <- injuries_and_running %>%
  mutate(week_formatted = format(as.Date(week), "%m-%d-%Y")) %>%
  distinct(anon_id, week, .keep_all = TRUE)

# Filter for only injury weeks
injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 1) %>%
  mutate(injury_event = paste0(anon_id, "__", week_formatted))

# Calculate statistics for injury weeks
injury_mean <- mean(injury_weeks$pct_of_max_velocity, na.rm = TRUE)

injury_summary_stats <- injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )
# Confidence interval 
se <- injury_summary_stats$sd_pct / sqrt(injury_summary_stats$n)
t_crit <- qt(0.975, df = injury_summary_stats$n - 1)
lower <- injury_summary_stats$mean_pct - t_crit * se
upper <- injury_summary_stats$mean_pct + t_crit * se
# Print
cat("95% CI for mean pct_of_max_velocity:", round(lower,2), "-", round(upper,2))


# Filter for non-injury weeks
non_injury_weeks <- injuries_and_running_clean %>%
  filter(injury == 0)

# Get statistics for non-injury weeks
non_injury_mean <- mean(non_injury_weeks$pct_of_max_velocity, na.rm = TRUE)

non_injury_summary_stats <- non_injury_weeks %>%
  summarise(
    mean_pct = mean(pct_of_max_velocity, na.rm = TRUE),
    sd_pct = sd(pct_of_max_velocity, na.rm = TRUE),
    n = sum(!is.na(pct_of_max_velocity))
  )

# Confidence interval
non_injury_se <- non_injury_summary_stats$sd_pct / sqrt(non_injury_summary_stats$n)
non_injury_t_crit <- qt(0.975, df = non_injury_summary_stats$n - 1)
non_injury_lower <- non_injury_summary_stats$mean_pct - non_injury_t_crit * non_injury_se
non_injury_upper <- non_injury_summary_stats$mean_pct + non_injury_t_crit * non_injury_se

# Print results
cat("95% CI for mean pct_of_max_velocity (non-injury weeks):", 
    round(non_injury_lower, 2), "-", round(non_injury_upper, 2))
```

```{r}
# T-test to compare pct_of_max_velocity in injuries and non injuries that week
t_test <- t.test(
  pct_of_max_velocity ~ injury, 
  data = injuries_and_running_clean,
  var.equal = FALSE # use TRUE if you assume equal variances
)

# Print results
print(t_test)
```
Group 1 (injury) mean = 86.55
Group 2 (non-injury) mean = 85.74
While the mean % of max velocity is higher for the injury group, the difference is small and not statistically significant. (high p-value, 0.69)

Plots of Percent of Max Velocity in that week
```{r}  
# Plot of % Max Velocity during injury and non injury weeks
ggplot(injuries_and_running_clean, aes(x = week, y = pct_of_max_velocity)) +
  geom_point(aes(color = factor(injury))) +
  scale_color_manual(values = c("0" = "black", "1" = "red"),
                     labels = c("No Injury", "Injury"),
                     name = "Injury Status") +
  labs(
    title = "Percent of Max Velocity by Week",
    subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_mean, 2), " |  Mean % of Max Velocity, Non-Injury: ", round(non_injury_mean, 2)),
    x = "Week",
    y = "% of Max Velocity Reached"
  ) +
  theme_classic()


ggplot(injury_weeks, aes(x = week, y = pct_of_max_velocity)) +
  geom_point(color = "red") +
  labs(
    title = "Percent of Max Velocity During Injury Weeks",
    subtitle = paste("% of Max Velocity Mean: ", round(injury_mean, 2)),
    x = "Week",
    y = "% of Max Velocity"
  ) +
  theme_classic()

# Create bar chart
ggplot(injury_weeks, aes(x = factor(injury_event), y = pct_of_max_velocity)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(pct_of_max_velocity, 1)),  # round to 1 decimal place
            vjust = -0.5, size = 2.75) +
  labs(
    title = "Percent of Max Velocity During Injury Weeks",
    subtitle = paste("Mean: ", round(injury_mean, 2)),
    x = "Injury Event",
    y = "% of Max Velocity"
  ) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```


```{r}
# Looking into weeks before injury

# Calculate percent of max velocity from 1, 2, and 3 weeks prior as well as the sum of weeks 1 and 2 and the sum of weeks 1, 2, and 3 as well as the change between week one and week 2
week_prior <- injuries_and_running_clean %>%
  group_by(anon_id) %>%
  arrange(week) %>%
  mutate(
    # Previous weeks values of % of max velocity
    lag1_pct_of_max_velocity = lag(pct_of_max_velocity, 1),
    lag2_pct_of_max_velocity = lag(pct_of_max_velocity, 2),
    lag3_pct_of_max_velocity = lag(pct_of_max_velocity, 3),
    
    # Change of % of max from current week to previous weeks
    change_lastweek = pct_of_max_velocity - lag1_pct_of_max_velocity,
    change_last2weeks = pct_of_max_velocity - lag2_pct_of_max_velocity,
    change_last3weeks = pct_of_max_velocity - lag3_pct_of_max_velocity,
    
    # Sum % of max of current to previous weeks
    sum_lastweek = pct_of_max_velocity + lag1_pct_of_max_velocity,
    sum_last2weeks = pct_of_max_velocity + lag1_pct_of_max_velocity + lag2_pct_of_max_velocity,
    sum_last3weeks = pct_of_max_velocity + lag1_pct_of_max_velocity + lag2_pct_of_max_velocity + lag3_pct_of_max_velocity,
    
    # Change between weeks (not current week)
    change_weeks1_2 = lag1_pct_of_max_velocity - lag2_pct_of_max_velocity,
    change_weeks1_3 = lag1_pct_of_max_velocity - lag3_pct_of_max_velocity,
    
    # Sum of weeks (not current week)
    sum_weeks1_2 = lag1_pct_of_max_velocity + lag2_pct_of_max_velocity,
    sum_weeks1_2_3 = lag1_pct_of_max_velocity + lag2_pct_of_max_velocity + lag3_pct_of_max_velocity
    ) %>%
  ungroup()

# Get Injury Event variable and Injury dataset
injury_week_prior <- week_prior %>%
  filter(injury == 1) %>%
  mutate(injury_event = paste0(anon_id, "__", week_formatted))

# Get non-injury dataset
non_injury_week_prior <- week_prior %>%
  filter(injury == 0)

# Get Means
means_injury <- injury_week_prior %>%
  drop_na(lag1_pct_of_max_velocity)
injury_week_prior_mean <- mean(means_injury$lag1_pct_of_max_velocity)
mean_noninjury <- non_injury_week_prior %>%
  drop_na(lag1_pct_of_max_velocity)
non_injury_week_prior_mean <- mean(mean_noninjury$lag1_pct_of_max_velocity)
injury_week_prior_mean
non_injury_week_prior_mean



# Plot of % Max Velocity during injury and non injury weeks
ggplot(week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
  geom_point(aes(color = factor(injury))) +
  scale_color_manual(values = c("0" = "black", "1" = "red"),
                     labels = c("No Injury", "Injury"),
                     name = "Injury Status") +
  labs(
    title = "Percent of Max Velocity of Prior Week",
    subtitle = paste("Mean % of Max Velocity, Injury: ", round(injury_week_prior_mean, 2), " |  Mean % of Max Velocity, Non-Injury: ", round(non_injury_week_prior_mean, 2)),
    x = "Week",
    y = "% of Max Velocity Reached"
  ) +
  theme_classic()


ggplot(injury_week_prior, aes(x = week, y = lag1_pct_of_max_velocity)) +
  geom_point(color = "red") +
  labs(
    title = "Percent of Max Velocity the week before injury",
    subtitle = paste("% of Max Velocity Mean: ", round(injury_week_prior_mean, 2)),
    x = "Week",
    y = "% of Max Velocity"
  ) +
  theme_classic()

# Bar chart for week prior to injury % Max
ggplot(injury_week_prior, aes(x = factor(injury_event), y = lag1_pct_of_max_velocity)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(lag1_pct_of_max_velocity, 1)),  # round to 1 decimal place
            vjust = -0.5, size = 2.75) +
  labs(
    title = "Percent of Max Velocity the Week Before Injury",
    subtitle = paste("% Max Mean: ", round(injury_week_prior_mean, 2)),
    x = "Injury Event",
    y = "% of Max Velocity"
  ) +
  theme_classic() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
```

```{r}
# T-test to compare lag_pct_of_max_velocity (max velocity the week before) between injury and non injury
t_test_week_prior <- t.test(
  lag1_pct_of_max_velocity ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_week_prior)

# T-test to compare change between current week and last week
t_test_change_lastweek <- t.test(
  change_lastweek ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_lastweek)

# T-test to compare change between current week and 2 weeks ago
t_test_change_last2weeks <- t.test(
  change_last2weeks ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_last2weeks)

# T-test to compare change between current week and 3 weeks ago
t_test_change_last3weeks <- t.test(
  change_last3weeks ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_last3weeks)


# T-test to compare sum of current week, 1, 2, and 3 weeks before
t_test_sum_last3weeks <- t.test(
  sum_last3weeks ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_last3weeks)
```
Mean % of max week before: injury = 86.6, non-injury = 86.2.
Not sig, pvalue = 0.87

Mean % change week before (current week % - last week %): injury = -0.057, non-injury = -0.50
Not sig, p-value = 0.89

Mean % change of last 2 weeks (current - 2 weeks ago): injury = -2.68, non-injury = -0.66
Not sig, p-value = 0.44

Mean % change of last 3 weeks (current - 3 weeks ago): injury = -0.60, non-injury = -0.81.
Not sig, p-value = 0.95

Sum of current, 1, 2, 3 weeks before:
injury = 347.86, non-injury = 343.51
Not sig, p-value = 0.41

```{r}
# Look into the last two weeks

# Drop NAs
lag_2_injury <- injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)
lag_2_non_injury <- non_injury_week_prior %>%
  drop_na(lag2_pct_of_max_velocity)

# Calculate means
mean_lag2_injury <- mean(lag_2_injury$lag2_pct_of_max_velocity)
mean_lag2_non_injury <- mean(lag_2_non_injury$lag2_pct_of_max_velocity)
mean_lag2_injury
mean_lag2_non_injury

# Two weeks before:
# T-test to compare lag2_pct_of_max_velocity (2 weeks before) between injury and non injury
t_test_2week_prior <- t.test(
  lag2_pct_of_max_velocity ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_2week_prior)

# Change of last two weeks (lag1 - lag2)
t_test_change_weeks1_2 <- t.test(
  change_weeks1_2 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_weeks1_2)

# Sum of weeks 1 and 2
t_test_sum_weeks1_2 <- t.test(
  sum_weeks1_2 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_weeks1_2)


# 3 weeks before
# T-test to compare lag3_pct_of_max_velocity (3 weeks before) between injury and non injury
t_test_3week_prior <- t.test(
  lag3_pct_of_max_velocity ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_3week_prior)

# Change between 1 weeks before and 3 weeks before
t_test_change_weeks1_3 <- t.test(
  change_weeks1_3 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_change_weeks1_3)

# Sum of 1, 2, and 3 weeks before injury
t_test_sum_weeks1_2_3 <- t.test(
  sum_weeks1_2_3 ~ injury, 
  data = week_prior,
  var.equal = FALSE)
print(t_test_sum_weeks1_2_3)
```
2 weeks before injury:
Means of % of max: injury = 88.7, non-injury = 85.8
Not a significant difference in means, p-value, 0.1833 > 0.05.

Means of change: injury = -2.34, non-injury = 0.108
Not significant (p-value = 0.47)

Mean of sum of weeks 1 and 2: injury = 175.14, non-injury = 171.72
Not significant, p-value = 0.333


3 weeks before injury:
Means of % of max: injury = 86.7, non-injury = 85.9
Not a significant difference in means, high p-value, 0.7

Mean sum of 1, 2, and 3 weeks before injury: injury = 261.8, non-injury = 257.3
Not sig, p-value = 0.36

Mean change between 1 and 3 weeks before: injury = -0.26, non-injury = -0.27
Not sig, p-val = 0.99

```{r}
model <- glm(injury ~ change_lastweek + change_last2weeks,
             data = week_prior, family = "binomial")
summary(model)
```
There is no clear evidence that % of max velocity in the current or prior weeks (or their changes/sums) differs between injury and non-injury weeks, based on t-tests of mean differences.

##### Comparing the number of >90% efforts with injury

```{r}
# Convert 90 percent max to logical
# Instead of Yes/No it is True/False
catapult_weekly <- Catapult_Session_clean %>%
  mutate(
    Date = as.Date(Date),
    week = floor_date(Date, unit = "week"),
    Hit90 = Hit.90.Percent.Max == "Yes"  # Convert to logical
  ) %>%
  distinct(anon_id, Date, .keep_all = TRUE)


# Get all combinations of player and week
all_combinations <- catapult_weekly %>%
  distinct(anon_id, week)

# Count weekly >90% sprint hits
weekly_hits <- catapult_weekly %>%
  group_by(anon_id, week) %>%
  summarise(
    count_90pct = sum(Hit90, na.rm = TRUE),
    .groups = "drop"
  )

# Join with all combinations to include zero counts
weekly_90pct <- all_combinations %>%
  left_join(weekly_hits, by = c("anon_id", "week")) %>%
  mutate(count_90pct = replace_na(count_90pct, 0))

# See breakdown of counts of # of times >90
distrubution <- weekly_90pct %>%
  count(count_90pct) %>%
  arrange(desc(n))
```

```{r}
# Create injury weeks
injury_weeks <- Incident_Report_clean %>%
  mutate(
    week = floor_date(as.Date(Date.of.Injury), unit = "week")
  ) %>%
  filter(Date.of.Injury >= as.Date("2024-06-30") & Date.of.Injury <= as.Date("2025-07-01")) %>%
  select(anon_id, week) %>%
  distinct() %>%
  mutate(injury = 1)

# Merge with injury data to have column that indicates injury week
weekly_90pct_injuries <- weekly_90pct %>%
  left_join(injury_weeks, by = c("anon_id", "week")) %>%
  mutate(injury = replace_na(injury, 0))

# Calculate lagged >90% effort counts
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  arrange(anon_id, week) %>%
  group_by(anon_id) %>%
  mutate(
    lag1_90pct_count = lag(count_90pct, 1),
    lag2_90pct_count = lag(count_90pct, 2),
    lag3_90pct_count = lag(count_90pct, 3),
    
    change_lastweek = count_90pct - lag1_90pct_count,
    change_last2weeks = count_90pct - lag2_90pct_count,
    change_last3weeks = count_90pct - lag3_90pct_count,
    
    sum_last2weeks = count_90pct + lag1_90pct_count,
    sum_last3weeks = count_90pct + lag1_90pct_count + lag2_90pct_count,
    
    avg_lastweek = (count_90pct + lag1_90pct_count) / 2,
    avg_last2weeks = (count_90pct + lag1_90pct_count + lag2_90pct_count) / 3,
    avg_last3weeks = (count_90pct + lag1_90pct_count + lag2_90pct_count + lag3_90pct_count) / 4
  ) %>%
  ungroup()
```

```{r}
# Bar chart of sprint counts the week of an injury

# Only injuries
injury_summary <- weekly_90pct_injuries %>%
  group_by(count_90pct) %>%
  summarise(
    total_injuries = sum(injury),
    total_weeks = n()
  )

ggplot(injury_summary, aes(x = factor(count_90pct), y = total_injuries)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = total_injuries), vjust = -0.3, size = 4) +
  labs(
    x = "Weekly >90% Sprint Count",
    y = "Total Injuries",
    title = "Injuries by >90% Sprint Count in the Week of Injury"
  ) +
  theme_classic()
```

```{r}
# Injury rate table for week of injury counts
injury_rate_table <- weekly_90pct_injuries %>%
  group_by(count_90pct) %>%
  summarise(
    total_weeks = n(),                           # Total weeks with this count
    injury_weeks = sum(injury == 1),             # How many had injury that week
    injury_rate = injury_weeks / total_weeks    # Proportion injured
  ) %>%
  arrange(desc(injury_rate))
injury_rate_table
```

```{r}
# Create a contingency table manually
injury_table <- matrix(c(
  10, 1343,   # Low: injured, not injured
  6, 395,     # Moderate: injured, not injured
  0, 78      # High: injured, not injured
), nrow = 3, byrow = TRUE)

# Add row and column names for clarity
rownames(injury_table) <- c("Low", "Moderate", "High")
colnames(injury_table) <- c("Injured", "Not_Injured")

# Run Fisher's Exact Test
fisher_result <- fisher.test(injury_table)

# View the result
print(fisher_result)
```
Our p-value of 0.33 means that there is no statically significance association between our sprint load groups and injury. 
Injury events are pretty rare overall in our dataset, making it hard to detect subtle differences.


```{r}
# Summarize injuries and total weeks by total sprint counts over last 2 weeks
injury_summary_2week <- weekly_90pct_injuries %>%
  mutate(sum_last2weeks = replace_na(sum_last2weeks, 0)) %>%
  group_by(sum_last2weeks) %>%
  summarise(
    total_injuries = sum(injury == 1),
    total_weeks = n(),
    injury_rate = (total_injuries / total_weeks) * 100
  ) %>%
  arrange(desc(injury_rate))
injury_summary_2week

# Plot total injuries by total sprint counts over last 2 weeks
ggplot(injury_summary_2week, aes(x = factor(sum_last2weeks), y = total_injuries)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = total_injuries), vjust = -0.3, size = 4) +
  labs(
    x = "Total >90% Sprint Counts Over Last 2 Weeks",
    y = "Total Injuries",
    title = "Injuries by Total >90% Sprint Counts Over the Last Two Weeks"
  ) +
  theme_classic()

# Plot injury rate
ggplot(injury_summary_2week, aes(x=factor(sum_last2weeks), y = injury_rate)) +
  geom_col(fill = "#CFB87C") +
  geom_text(aes(label = round(injury_rate, 3)), vjust = -0.3, size = 4) +
  labs(
    x = "Total >90% Sprint Counts Over Last 2 Weeks",
    y = "Injury Rate",
    title = "Injurie Rate by Total >90% Sprint Counts Over the Last Two Weeks"
  ) +
  theme_classic()
```

```{r}
# Table for counts over last few weeks
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(avg2_group = round(avg_last2weeks, 1))

# Create binned groups: Low (0–1), Moderate (1–2), High (2+)
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(load_group = case_when(
    avg_last2weeks < 1 ~ "Low (<1)",
    avg_last2weeks >= 1 & avg_last2weeks < 2 ~ "Moderate(1-2)",
    avg_last2weeks >= 2 ~ "High (2+)"
  ))

# Create a table of counts and injury rates by group
injury_rate_by_group <- weekly_90pct_injuries %>%
  group_by(load_group) %>%
  summarise(
    total_weeks = n(),
    injury_weeks = sum(injury == 1),
    injury_rate = injury_weeks / total_weeks
  ) %>%
  arrange(load_group)
injury_rate_by_group

# Create a table of counts and injury rates
injury_rate_2weekavg <- weekly_90pct_injuries %>%
  group_by(avg2_group) %>%
  summarise(
    total_weeks = n(),
    injuries = sum(injury == 1),
    injury_rate = (injuries / total_weeks) * 100
  ) %>%
  arrange(desc(injury_rate))
injury_rate_2weekavg
```
Current week counts:
Injuries increase slightly from 0-3 counts with 3 having the highest injury rate of 2.0%. 4 and 5 counts have no injuries but a low sample size.

Counts over last few weeks (Grouped):
High counts had no injuries
Moderate counts had the highest injury rate (1.52%)
Low counts had a injury rate of 0.74%

Injury rates increased from low counts (<1) injury rate of 0.74% to moderate counts (1-2) injury rate of 1.52% then dropped to 0% for high counts (2+). However, high counts have a small sample size (78), so the rate may be unstable.

```{r}
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(workload_group = case_when(
    avg2_group < 0.5 ~ "Low (<0.5)",
    avg2_group >= 0.5 ~ "High (>0.5)",
    TRUE ~ NA_character_
  ))

injury_rate_grouped <- weekly_90pct_injuries %>%
  group_by(workload_group) %>%
  summarise(
    total_weeks = n(),
    injuries = sum(injury == 1, na.rm = TRUE),
    injury_rate = (injuries / total_weeks) * 100
  )
injury_rate_grouped


```

Average over the last three weeks. The injury rate is higher when athletes average over 0.5 over 90 percent max sprint counts. 

```{r}
# Make contingency table
contingency <- matrix(c(8, 716, 8, 1084), nrow = 2, byrow = TRUE)
colnames(contingency) <- c("Injury", "NoInjury")
rownames(contingency) <- c("High", "Low")

# Run test
chisq.test(contingency)
```
Since our p-value of 0.56>0.05, there is no statistically significant difference in injury rates between the High and Low >90 count groups.
The chi-square test of the injury rates of average >90 counts of the last 3 weeks (either high > 0.5, or low < 0.5) showed no significant difference in injury rates between these groups.

It is important to note that the number of injury events observed in the dataset is relatively low. As a result, the statistical power to detect significant differences in injury rates between groups is limited. This low event count increases the likelihood of a Type II error (failing to detect a true effect), and thus, non-significant results should be interpreted with caution. Future studies with larger sample sizes or more injury occurrences are needed to better assess the relationship between workload and injury risk.

```{r}
# T-tests:

# Current week
t.test(count_90pct ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Week before
t.test(lag1_90pct_count ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Change last week to this week
t.test(change_lastweek ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Change last two weeks
t.test(change_last2weeks ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)

# Sum over last 2 weeks
t.test(sum_last2weeks ~ injury, data = weekly_90pct_injuries, var.equal = FALSE)
```

```{r}
weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(
    sprint_exposure_bin = ifelse(count_90pct > 0, "Some", "None")
  )

table_exposure <- table(weekly_90pct_injuries$sprint_exposure_bin, weekly_90pct_injuries$injury)

# Fisher's Exact Test
fisher.test(table_exposure)
```

```{r}
# Create top quartile exposure
cutoff <- quantile(weekly_90pct_injuries$count_90pct, 0.75, na.rm = TRUE)

weekly_90pct_injuries <- weekly_90pct_injuries %>%
  mutate(
    sprint_exposure_group = ifelse(count_90pct >= cutoff, "High", "Low")
  )

table_quartile <- table(weekly_90pct_injuries$sprint_exposure_group, weekly_90pct_injuries$injury)

fisher.test(table_quartile)
```


```{r}
glm(injury ~ count_90pct, data = weekly_90pct_injuries, family = "binomial")

glm(injury ~ lag1_90pct_count, data = weekly_90pct_injuries, family = "binomial")

change <- glm(injury ~ change_lastweek, data = weekly_90pct_injuries, family = "binomial")
summary(change)
glm(injury ~ change_last2weeks, data = weekly_90pct_injuries, family = "binomial")


sum <- glm(injury ~ sum_last3weeks, data = weekly_90pct_injuries, family = "binomial")
summary(sum)
```

```{r}
multi_model <- glm(
  injury ~ count_90pct + sum_last2weeks,
  data = weekly_90pct_injuries,
  family = "binomial"
)

summary(multi_model)
```


```{r}
ggplot(weekly_90pct_injuries, aes(x = change_lastweek, y = injury)) +
  geom_jitter(height = 0.05, alpha = 0.3, color = "blue") +  # Jitter to spread points vertically
  geom_smooth(method = "glm", method.args = list(family = "binomial"), se = TRUE, color = "red") +  # Logistic regression curve
  labs(
    title = "Probability of Injury vs Change in >90% Sprint Counts (Last Week)",
    x = "Change in >90% Sprint Counts (Current Week - Previous Week)",
    y = "Injury (0 = No, 1 = Yes)"
  ) +
  theme_minimal()
```

No t-tests or models revealed any insights. 







```{r removing unnecessary objects for running imbalance analysis}
#removing extra data
remove(band_long, bigs, clean_anons, combo, cor_data, cor_matrix, daily_90_counts,
       full_grid, hit_90_counts, ID_11, Incident_dates, injured_data, injuries_and_running,
       injury_by_group, injury_weeks, max_velocities, model_all_bands, model_big,
       model_bigs, model_combo, model_skill, player_counts_long, player_data,
       player_positions, plot_data, plots_by_position, position_averages,
       position_averages_with_team, pre_injury_weeks, QBs, relative_bands,
       relative_bands_long, skill, sprint_counts, sprint_exposure_big,
       sprint_exposure_combo, sprint_exposure_skill, sprint_injury_table, V8, 
       weekly_90_counts, weekly_band_effort_by_group, weekly_effort, weekly_effort_long,
       weekly_effort_wide, weekly_max_pct, weekly_max_velocity, weekly_sprint_counts,
       weekly_sprint_counts_lagged, weekly_sprint_exposure, weekly_velocity_efforts)

#removing extra values and functions
remove(all_athletes, all_weeks, band_levels, BIG, COMBO, SKILL, injured_ids,
       injury_week_nums, overall_avg, player_id, positions, Positions, qb_avg, team_avg,
       plot_hit_90_by_position)
```


## Maximum Velocity Trends over Time
```{r}
catapult_ids <- unique(Catapult_Session_clean$anon_id)

Catapult_Session_clean <- Catapult_Session_clean %>%
  group_by(anon_id) %>%
  mutate(week = floor_date(Date, unit="week")) %>%
  ungroup() %>%
  group_by(anon_id, week) %>%
  mutate(Week.Max.Velocity = mean(na.omit(Session.Max.Velocity)))

kendalls <- rep(NA, 104)
negative_trend <- "0"

for(i in 1:104){
  
  speeds <- unique(Catapult_Session_clean[Catapult_Session_clean$anon_id==catapult_ids[i],]$Week.Max.Velocity)
  weeks <- rev(1:length(speeds))
  
  kend <- cor(weeks, speeds, method="kendall")
  
  if(length(speeds)>5){
  kendalls[i] = kend
  
    if(kend <= -0.5){
      negative_trend <- c(negative_trend, catapult_ids[i])
    }
  
  p <- ggplot(filter(Catapult_Session_clean, anon_id == catapult_ids[i]), aes(week,
                                                            Week.Max.Velocity)) +
  geom_point(color=ifelse(kend <= -0.5, "red", "black")) +
  geom_line(color=ifelse(kend <= -0.5, "red", "black")) +
  labs(title = "Average Top Running Speed per Week", subtitle=kend) +
  ylim(0,25)
       
       
  print(p)
  
  
  }
  
}
```
```{r}
hist(kendalls)
abline(v=-0.5)
abline(v=0.5) 
abline(v=median(na.omit(kendalls)))
```
Based on the Kendall Correlations calculated from the average maximum velocity per week for each player we can see that the distributions of correlations shows that most players tend to have a weak Kendall Correlation coefficient. This suggests that player may tend to see a decrease in maximum velocity per week throughout their time at CU. For most players with a negative coefficient though, it is considered really weak. The players with a strong negative correlation coefficient are ID 30, ID 178, ID 220, and ID 65. These are the ones that have almost a definitive negative trend in their maximum running speeds. These players are the only ones that we can say for certain have seen a decrease in their top running speed. 

```{r}
data_130 <- Catapult_Session_clean %>%
  filter(anon_id == "ID_130")
data_178 <- Catapult_Session_clean %>%
  filter(anon_id == "ID_178")
data_220 <- Catapult_Session_clean %>%
  filter(anon_id == "ID_220")
data_65 <- Catapult_Session_clean %>%
  filter(anon_id == "ID_65")

model_130 <- lm(Week.Max.Velocity ~ week, data=data_130)
summary(model_130)
model_178 <- lm(Week.Max.Velocity ~ week, data=data_178)
summary(model_178)
model_220 <- lm(Week.Max.Velocity ~ week, data=data_220)
summary(model_220)
model_65 <- lm(Week.Max.Velocity ~ week, data=data_65)
summary(model_65)
```
Looking at the slope coefficients for players that had the lowest correlation coefficients we can see that all of them had a relatively small slope value. All of them being lower than 0.04 in absolute value. This suggests that while they had a detectable negative trend throughout their time at CU, none of them had any sharp decreases. 


# Section 2: Running Imbalance

## What is the variation at the team level and at each individual athlete level?

  Looking at team data as a whole, since January 1, 2024 there is absolute no deviance from 0. That means that since January 1, 2024, the team has had the same average running imbalance of 0. This makes sense given that the team is so large and that imbalances can go from -100% to 100%. This suggests that throughout this time, at no point was there a team sway to one side. There also weren't any points since January 1, 2024 that the team had any large spikes in average absolute value of running imbalance. This suggests that at no point throughout the season were there larger spikes than normal in running imbalance. 
  Each player tends to have a very unique trend in their running imbalance. Looking at how the team varies but also at how each player varies throughout the season, it's hard to make out any pattern that's applicable to most people. The variance of running imbalance varies greatly between each player. Instead we looked at the variances between players who were injured and those who were not. Based on three different bootstrapped findings, we can see that the variances between players who were injured and those who were not were statistically significant.
  
  For the first bootstrap, we compared the variance of the pooled groups meaning that the variance in running imbalance for players who were injured and those who were not were compared. This resulted in a 90% confidence interval which suggested that the difference in variance between the two groups is between 0.30 and 3.45. This suggests that when looking at the variance of the two groups separately but all the players are pooled together, the variances will most likely be different by factor between 0.30 and 3.45 and the variance for the injured pool will be greater than that of the uninjured pool.
  For the second bootstrap, each player's variance was taken individually. This unpooled approach was taken to see if an individual player's variance in running imbalance could potentially be related to HSI risk. The bootstrap algorithm in this case took the averaged variances of the bootstrapped sample for each group and compared them. This bootstrap produced a 90% confidence interval for the difference in average variance between the two groups is 0.79 to 1.32. These results suggest that players who sustained a hamstring injury since January 1, 2024 had, on average, a greater variance in their running imbalance by about 1.06. This suggests that there is a relationship between variance in running imbalance and HSI risk. This found increase in variability will be used to address the following questions.
  For the third bootstrap, we wanted to see if there was a difference in the average mean absolute value in running imbalance between players who were and weren't injured. The bootstrapping algorithm for this test calculated the average absolute distance value or each running imbalance measurement and found the average for each sample. This test found that at the 90% significance level, injured players had an average running imbalance absolute value between 0.06 and 0.32 greater than their uninjured counterparts. These values though, when we consider that the range of running imbalance goes from 0 to 100 is small and may be hard to detect when out in the field. 
  
  We also looked at the relationship between running imbalance and higher level position. From this analysis we found that there doesn't seem to be a super strong relationship between the three categories and average running imbalance variance. The bootstrap revealed that there are potentially significant differences between those who are Bigs and Combos. But, those who are Skills weren't able to differentiate themselves between the two groups. 
  Along with this, we looked at the average absolute value in running imbalance for the three groups. This analysis told us that while Combos and Bigs tended to have the same average absolute value running imbalance, Skills had a significantly higher average absolute running imbalance value. 
  We can see from the very last chart in this analysis that Skills make up the most of those with hamstring injuries followed closely by Combos and Bigs making up around half of the amount of Skills. This is interesting considering that the amounts of Bigs, Skills, and Combos within the Historical Running data set are all roughly the same. 

### Team Analysis
```{r Looking at team and player variances of running imbalance}
#team variation
Historical_Running_clean %>%
  summarize(Team_Variation = var(Running.Imbalance))

#individual player variation
Historical_Running_clean %>%
  group_by(anon_id) %>%
  summarize(Player_Variation = var(na.omit(Running.Imbalance))) %>%
  ungroup() 

#average variance in running imbalance across all players
Historical_Running_clean %>%
  group_by(anon_id) %>%
  mutate(Player.var = var(na.omit(Running.Imbalance))) %>%
  ungroup() %>%
  summarize(Average_Player_Variance = mean(na.omit(Player.var)))

#making variance and average absolute value for each date to see trends
Historical_Running_clean <- Historical_Running_clean %>%
  group_by(Date) %>%
  mutate(Date.Variance = var(na.omit(Running.Imbalance)),
         Date.Avg.Abs.Value = mean(abs(na.omit(Running.Imbalance)))) %>%
  ungroup()
```


```{r Running imbalance measurements for whole team since January 1, 2024}
#calculating mean and variance for team data
team_mean <- mean(Historical_Running_clean$Running.Imbalance)
team_sd <- sd(Historical_Running_clean$Running.Imbalance)

#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = team_mean, color = "#CFB87C") +
  geom_hline(yintercept = team_mean + team_sd) +
  geom_hline(yintercept = team_mean - team_sd) +
  geom_hline(yintercept = team_mean + (2*team_sd), color = "#A2A4A3") +
  geom_hline(yintercept = team_mean - (2*team_sd), color = "#A2A4A3") +
  geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
  labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
       subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215") +
  theme_minimal()

#making scatter plot of team running imbalance data throughout season
ggplot(Historical_Running_clean, aes(Date, Running.Imbalance)) +
  geom_point(alpha = 0.3) +
  geom_hline(yintercept = team_mean, color = "#CFB87C") +
  geom_smooth(method = "lm", se = TRUE, color = "#CFB87C") +
  geom_line(aes(x=Date, y=Date.Avg.Abs.Value), color = "#CFB87C") +
  geom_line(aes(x=Date, y=-Date.Avg.Abs.Value), color = "#CFB87C") +
  labs(title = "Team Running Imbalance Since January 1, 2024", y="Running Imbalance (%)",
       subtitle = "\u03BC = 0.08623412, \u03C3^2 = 14.94215") +
  theme_minimal()

#making histogram of team running imbalance data
ggplot(Historical_Running_clean, aes(Running.Imbalance)) +
  geom_histogram() +
  labs(title = "Team Running Imbalance Since January 1, 2024", x="Running Imbalance") +
  theme_minimal()
```


### Injured Analysis
```{r Looking at distributions for injured and uninjured separately}
#making histogram for running imbalance of all injured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(fill = "#CFB87C", alpha = 0.75) +
  #adding in 95% confidence interval
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.025), color = "#CFB87C") +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.975), color = "#CFB87C") +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players with HSI since January 1, 2024") +
  theme_minimal()

#making histogram for running imbalance of all uninjured athletes
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(fill = "black", alpha = 0.75) +
  #adding in 95% confidence interval
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.025)) +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.975)) +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players without HSI since January 1, 2024") +
  theme_minimal()

#Plotting injured and uninjured histograms over top one another
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Running.Imbalance)) +
  geom_histogram(alpha = 0.75) +
  #adding in 95% CI for uninjured players
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.05)) +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,]$Running.Imbalance, 0.95)) +
  geom_histogram(data = Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Running.Imbalance), fill = "#CFB87C", alpha = 0.75) +
  #adding in 95% CI for injured players
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,]$Running.Imbalance, 0.95), color = "#CFB87C") +
  xlim(-21,21) +
  labs(title = "Running Imbalance for Players with and without HSI since January 1, 2024") +
  theme_minimal()
```


```{r looking at trends for injured and uninjured players since 1-1-2024}
#making scatter plot of running imbalance data for injured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,], aes(Date, Running.Imbalance)) + 
  geom_point(alpha = 0.3) +
  ylim(-21,21) +
  labs(title = "Running Imbalance for Players with HSI since January 1, 2024") +
  theme_minimal()

#making scatter plot of running imbalance for uninjured players
ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,], aes(Date, Running.Imbalance)) + 
  geom_point(alpha = 0.3) +
  ylim(-21,21) +
  labs(title = "Running Imbalance for Players without HSI since January 1, 2024") +
  theme_minimal()
```


#### Bootstrapping Differences between Injured and Uninjured Athletes
```{r Separating injured and uninjured into two different data sets}
#Splitting up the data sets and calculating player variance and measurement absolute value
injured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% injured_IDs,] %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()

uninjured_data <- Historical_Running_clean[Historical_Running_clean$anon_id %in% uninjured_IDs,] %>%
  mutate(Player.Absolute.Dist = abs(Running.Imbalance)) %>%
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
```


```{r Boostrapping to look at variances of different groups, pooled}
#making a data frame to hold all of the within group variances
group_variances <- data.frame(injured_var = rep(NA,5000),
                uninjured_var = rep(NA,5000),
                diff_in_var = rep(NA, 5000))

#bootstrap for variance, 5000 iterations
for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
  
  #storing the calculated variances in data frame
  group_variances[i,1] = var(injured_sample$Running.Imbalance)
  group_variances[i,2] = var(uninjured_sample$Running.Imbalance)
  group_variances[i,3] = group_variances[i,1] - group_variances[i,2]
}
```


```{r Plotting results from pooled boostrapped variances}
ggplot(data=group_variances, aes(diff_in_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(group_variances$diff_in_var, 0.95), color= "#CFB87C") +
  labs(x="Difference in Variance") +
  theme_minimal()

ggplot(data=group_variances) +
  geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_var), alpha = 0.75) +
  labs(x="Variance") +
  theme_minimal()
```


```{r Bootstrapping to look at average player variances}
#making a data frame to hold all of the average player variances between groups
mean_player_variances <- data.frame(injured_var = rep(NA,5000),
                uninjured_var = rep(NA,5000),
                diff_in_var = rep(NA, 5000))

for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1672)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2391)
  
  #storing the calculated variances in data frame
  mean_player_variances[i,1] = mean(na.omit(injured_sample$Player.Variance))
  mean_player_variances[i,2] = mean(na.omit(uninjured_sample$Player.Variance))
  mean_player_variances[i,3] = mean_player_variances[i,1] - mean_player_variances[i,2]
}
```


```{r Plotting results from averaged bootstrapped player variances}
ggplot(data = mean_player_variances, aes(diff_in_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(mean_player_variances$diff_in_var, 0.95), color= "#CFB87C") +
  labs(title = "Difference in Estimated Average Variance for Injured and Uninjured Athletes", x="Difference in Average Variance") +
  theme_minimal()

ggplot(data=mean_player_variances) +
  geom_histogram(aes(injured_var), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_var), alpha = 0.75) +
  labs(title = "Estimated Average Variance for Injured and Uninjured Athletes", x="Average Variance") +
  theme_minimal()
```


```{r Bootstrapping to look at average absolute value running imbalance}
#making a data frame to hold all of the average absolute differences from 0
group_distance <- data.frame(injured_dist = rep(NA,5000),
                uninjured_dist = rep(NA,5000),
                diff_in_dist = rep(NA, 5000))

#bootstrap for variances, 5000 iterations
for(i in 1:5000){
  #random seed
  set.seed(i) 
  
  #taking samples from each of the data sets, same number of rows, replacement true
  injured_sample <- sample_n(injured_data, replace = TRUE, size = 1658)
  uninjured_sample <- sample_n(uninjured_data, replace=TRUE, size = 2405)
  
  #storing the calculated variances in data frame
  group_distance[i,1] = mean(injured_sample$Player.Absolute.Dist)
  group_distance[i,2] = mean(uninjured_sample$Player.Absolute.Dist)
  group_distance[i,3] = group_distance[i,1] - group_distance[i,2]
}
```


```{r Plotting results from averaged boostrapped absolute value running imbalance}
ggplot(data = group_distance, aes(diff_in_dist)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.05), color= "#CFB87C") +
  geom_vline(xintercept = quantile(group_distance$diff_in_dist, 0.95), color= "#CFB87C") +
  labs(title = "Estimated Difference in Average Absolute Value", x="Difference in Average Absolute Value") +
  theme_minimal()

ggplot(data = group_distance) +
  geom_histogram(aes(injured_dist), alpha = 0.75, fill ="#CFB87C") +
  geom_histogram(aes(uninjured_dist), alpha = 0.75) +
  labs(title = "Estimated Average Absolute Value", x="Average Absolute Value")  +
  theme_minimal()
```


```{r Removing unnecessary objects from loops}
#removing junk that came from the loops
remove(i,team_mean, team_sd, injured_sample, uninjured_sample, group_variances, injured_data, uninjured_data, mean_player_variances, group_distance)
```


### Position Analysis
```{r sorting player into general positions}
#making lists to sort position into larger categories
COMBO <- c("QB","LB","TE","RB", "ILB")
BIG <- c("OL", "DL", "DE", "DT")
SKILL <- c("WR", "DB", "CB", "SAF")
Positions <- c("COMBO", "BIG", "SKILL")

#giving positions to incident report
Incident_Report_clean <- Incident_Report_clean %>%
  mutate(Specific.Position = Position,
         Position = case_when(Specific.Position %in% COMBO ~ "COMBO",
                               Specific.Position %in% BIG ~ "BIG",
                               Specific.Position %in% SKILL ~ "SKILL"))

#giving positions to catapult session
Catapult_Session_clean <- Catapult_Session_clean %>%
  mutate(Specific.Position = Primary.Position,
         Position = case_when(Primary.Position %in% COMBO ~ "COMBO",
                               Primary.Position %in% BIG ~ "BIG",
                               Primary.Position %in% SKILL ~ "SKILL"))

#only taking IDs and position names and categories
incident_info <- Incident_Report_clean[,c("anon_id", "Position", "Specific.Position")]
catapult_info <- Catapult_Session_clean[,c("anon_id", "Position", "Specific.Position")]

#comprehensive list of IDs, their position, and category
info <- distinct(rbind(incident_info, catapult_info))

#add this only historical running
Historical_Running_clean <- left_join(Historical_Running_clean, info, by="anon_id",
                                      relationship="many-to-many") %>%
  #calculating each player's variance in running imbalance
  group_by(anon_id) %>%
  mutate(Player.Variance = var(Running.Imbalance)) %>%
  ungroup()
```


```{r Looking at Running Imbalance }
for(i in 1:3){
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$Position==Positions[i],],aes(Running.Imbalance)) +
    geom_histogram() +
    labs(subtitle=Positions[i])
  
  print(p)
}
```

```{r}
Player_Summary_Stats <- distinct(Historical_Running_clean[,c("anon_id", "Position",
                                                 "Specific.Position", 
                                                 "Player.Variance")])
```


```{r}
for(i in 1:3){
  p <- ggplot(data=Player_Summary_Stats[Player_Summary_Stats$Position==Positions[i],],
              aes(Player.Variance)) +
    geom_histogram() +
    labs(subtitle=Positions[i])
  
  print(p)
}
```


#### Bootstrapping Different Positions
```{r}
#splitting up data set into the different categories
COMBOS <- Player_Summary_Stats %>%
  filter(Position == "COMBO") %>%
  na.omit()

SKILLS <- Player_Summary_Stats %>%
  filter(Position == "SKILL") %>%
  na.omit()

BIGS <- Player_Summary_Stats %>%
  filter(Position == "BIG") %>%
  na.omit()

group_avg_variance <- data.frame(COMBO_var = rep(NA, 5000),
                                 SKILL_var = rep(NA, 5000),
                                 BIG_var = rep(NA, 5000))
```


```{r}
for(i in 1:5000){
  set.seed(i)
  combo_sample <- sample_n(COMBOS, size=20, replace=TRUE)
  skill_sample <- sample_n(SKILLS, size=22, replace=TRUE)
  big_sample <- sample_n(BIGS, size=24, replace=TRUE)
  
  group_avg_variance[i,1] <- mean(na.omit(combo_sample$Player.Variance))
  group_avg_variance[i,2] <- mean(na.omit(skill_sample$Player.Variance))
  group_avg_variance[i,3] <- mean(na.omit(big_sample$Player.Variance))
}
```


```{r}
ggplot(data=group_avg_variance, aes(COMBO_var)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.05)) +
  geom_vline(xintercept = quantile(group_avg_variance$COMBO_var, 0.95)) +
  theme_minimal()

ggplot(data=group_avg_variance, aes(SKILL_var)) +
  geom_histogram(fill="#CFB87C") +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.05), color="#CFB87C") +
  geom_vline(xintercept = quantile(group_avg_variance$SKILL_var, 0.95), color="#CFB87C") +
  theme_minimal()

ggplot(data=group_avg_variance, aes(BIG_var)) +
  geom_histogram(fill="#A2A4A3") +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.05), color="#A2A4A3") +
  geom_vline(xintercept = quantile(group_avg_variance$BIG_var, 0.95), color="#A2A4A3") +
  theme_minimal()

ggplot(data=group_avg_variance) +
  geom_histogram(aes(COMBO_var), alpha=0.5, fill="black") +
  geom_histogram(aes(SKILL_var), alpha=0.5, fill="#CFB87C") +
  geom_histogram(aes(BIG_var), alpha=0.5, fill="#A2A4A3") +
  labs(title="Estimated Average Variance in Running Imbalance", x="Average Variance in Running Imbalance") +
  theme_minimal()
```
```{r}
var_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                       Lower_Bound = c(quantile(group_avg_variance$COMBO_var, 0.05), 
                                       quantile(group_avg_variance$SKILL_var, 0.05), 
                                       quantile(group_avg_variance$BIG_var, 0.05)),
                       Median = c(quantile(group_avg_variance$COMBO_var, 0.5),
                                  quantile(group_avg_variance$SKILL_var, 0.5),
                                  quantile(group_avg_variance$BIG_var, 0.5)),
                       Upper_Bound = c(quantile(group_avg_variance$COMBO_var, 0.95),
                                       quantile(group_avg_variance$SKILL_var, 0.95),
                                       quantile(group_avg_variance$BIG_var, 0.95)))
head(var_confints)

ggplot(data=var_confints, aes(x=Category, y=Median, ymin=Lower_Bound, ymax=Upper_Bound, color=Category)) +
  geom_pointrange() +
  labs(y="Estimated Average Variance in Running Imbalance", title="Running Imbalance Variance 90% CI") +
  scale_color_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  theme_minimal()
```
```{r}
Historical_Running_clean <- Historical_Running_clean %>%
  mutate(Abs.Value.Running.Imbalance = abs(Running.Imbalance))

COMBOS <- Historical_Running_clean %>%
  filter(Position == "COMBO")

SKILLS <- Historical_Running_clean %>%
  filter(Position == "SKILL")

BIGS <- Historical_Running_clean %>%
  filter(Position == "BIG")

group_avg_dist <- data.frame(COMBO_dist = rep(NA, 5000),
                             SKILL_dist = rep(NA, 5000),
                             BIG_dist = rep(NA, 5000))
```

```{r}
for(i in 1:5000){
  set.seed(i)
  combo_sample <- sample_n(COMBOS, size=1203, replace=TRUE)
  skill_sample <- sample_n(SKILLS, size=1635, replace=TRUE)
  big_sample <- sample_n(BIGS, size=1224, replace=TRUE)
  
  group_avg_dist[i,1] <- mean(na.omit(combo_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i,2] <- mean(na.omit(skill_sample$Abs.Value.Running.Imbalance))
  group_avg_dist[i,3] <- mean(na.omit(big_sample$Abs.Value.Running.Imbalance))
}
```


```{r}
ggplot(data=group_avg_dist) +
  geom_histogram(aes(COMBO_dist), alpha=0.5, fill="black") +
  geom_histogram(aes(SKILL_dist), alpha=0.5, fill="#CFB87C") +
  geom_histogram(aes(BIG_dist), alpha=0.5, fill="#A2A4A3") +
  labs(title = "Estimated Average Absolute Value Running Imbalance", x="Average Absolute Value Running Imbalance") +
  theme_minimal()

```


```{r}
dist_confints <- data.frame(Category = c("COMBO", "SKILL", "BIG"),
                       Lower_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.05), 
                                       quantile(group_avg_dist$SKILL_dist, 0.05), 
                                       quantile(group_avg_dist$BIG_dist, 0.05)),
                       Median = c(quantile(group_avg_dist$COMBO_dist, 0.5),
                                  quantile(group_avg_dist$SKILL_dist, 0.5),
                                  quantile(group_avg_dist$BIG_dist, 0.5)),
                       Upper_Bound = c(quantile(group_avg_dist$COMBO_dist, 0.95),
                                       quantile(group_avg_dist$SKILL_dist, 0.95),
                                       quantile(group_avg_dist$BIG_dist, 0.95)))
head(dist_confints)

ggplot(data=dist_confints, aes(x=Category, y=Median, ymin=Lower_Bound, ymax=Upper_Bound, color=Category)) +
  geom_pointrange() +
  scale_color_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Average Absolute Value Running Imbalance 90% CI", y="Estimated Average Absolute Value Running Imbalance") +
  theme_minimal()
```


```{r}
Injury_Incidents <- distinct(Incident_Report_clean[,c("anon_id", "Position", "Date.of.Injury")])

Position_counts <- na.omit(distinct(Historical_Running_clean[,c("anon_id", "Position")]))

ggplot(Injury_Incidents, aes(Position, fill=Position)) +
  geom_bar() +
  scale_fill_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Hamstring Injuries by Position") +
  theme_minimal()

ggplot(Position_counts, aes(Position, fill=Position)) +
  geom_bar() +
  scale_fill_manual(values = c("COMBO" = "black", "SKILL" = "#CFB87C", "BIG" = "#A2A4A3")) +
  labs(title="Total Position Counts on Team") +
  theme_minimal()
```

```{r Removing unnecessary objects from environment}
remove(big_sample, BIGS, catapult_info, combo_sample, COMBOS, confints, group_avg_variance,
       incident_info, info, p, Player_Summary_Stats, skill_sample, SKILLS, BIG, COMBO, i,
       Positions, SKILL, dist_confints, group_avg_variance, group_avg_dist, group_distance, 
       group_variances, injured_data, injured_sample, Injury_Incidents, mean_player_variances, 
       Position_counts, var_confints)
```


## What is a meaningful change? What red flags should go off when we see a week-to-week change in running imbalance?

  Based on the analysis below, there doesn't seem to be any major discernible differences in running imbalance before or following an injury. This suggests that there may not be a direct link between HSI risk and running imbalance value directly beforehand. Instead, the trends of many players who were injured seems to have a relatively consistent trend not entirely dependent on time. 
  Looking at summary statistics of running imbalance in the weeks leading up to and following a hamstring injury, there are also no glaring trends. For this analysis, we looked at the mean and variance in running imbalance per week leading up to and after an injury for all of the injured players with running imbalance data. This showed us that there is no clear indicator of HSI risk in running imbalance or any summary statistic of it. Instead it may be more useful to look at each player's total running imbalance and their individual variance. This seems to be more of a useful tool for differentiating between injured and uninjured athletes.

```{r Calculating weeks before and after injury occurance based on date, how many injuries player has}
#getting running imbalances for just injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)

#making new column to represent when in time injury would be, negative means before injury and positive means after injury, 0 means date of injury if there's data for that day
Injured_Historical_Running$Weeks.After.Injury <- rep(NA, 1658)
Injured_Historical_Running$Injury.Count <- rep(NA, 1658)

#making new column in incident report for the injury count
Incident_Report_clean$Injury.Count <- rep(NA, 122)

#go through all of the injured players in the data set
for(i in 1:22){
  #get the dates each player was injured
  injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
  
  #go through all of the dates in which the player had an injury
  for(j in 1:length(injury_dates)){
    
    #calculate dates for 1, 2, 3, 4 weeks before and after each injury date
    past_1 <- injury_dates[j]-7
    past_2 <- injury_dates[j]-14
    past_3 <- injury_dates[j]-21
    past_4 <- injury_dates[j]-28
    future_1 <- injury_dates[j]+7
    future_2 <- injury_dates[j]+14
    future_3 <- injury_dates[j]+21
    future_4 <- injury_dates[j]+28
    
    #Calculating how many injuries this is for the player
    injury_count <- as.character((length(injury_dates)) - j + 1)
    
    #compare date of data point for each player to date of injury, store in Weeks.After.Injury column, store injury count
    
    #first week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > injury_dates[j] & 
                                 Injured_Historical_Running$Date<=future_1,]$Weeks.After.Injury <- "1"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > injury_dates[j] & 
                                 Injured_Historical_Running$Date<=future_1,]$Injury.Count <- injury_count
    
    #second week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_1 & 
                                 Injured_Historical_Running$Date<=future_2,]$Weeks.After.Injury <- "2"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_1 & 
                                 Injured_Historical_Running$Date<=future_2,]$Injury.Count <- injury_count
    
    #third week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_2 & 
                                 Injured_Historical_Running$Date<=future_3,]$Weeks.After.Injury <- "3"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_2 & 
                                 Injured_Historical_Running$Date<=future_3,]$Injury.Count <- injury_count
    
    #fourth week after injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_3 & 
                                 Injured_Historical_Running$Date<=future_4,]$Weeks.After.Injury <- "4"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date > future_3 & 
                                 Injured_Historical_Running$Date<=future_4,]$Injury.Count <- injury_count    
    #week right before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < injury_dates[j] & 
                                 Injured_Historical_Running$Date>=past_1,]$Weeks.After.Injury <- "-1"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < injury_dates[j] & 
                                 Injured_Historical_Running$Date>=past_1,]$Injury.Count <- injury_count
    #two weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_1 & 
                                 Injured_Historical_Running$Date>=past_2,]$Weeks.After.Injury <- "-2"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_1 & 
                                 Injured_Historical_Running$Date>=past_2,]$Injury.Count <- injury_count
    
    #three weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_2 & 
                                 Injured_Historical_Running$Date>=past_3,]$Weeks.After.Injury <- "-3"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_2 & 
                                 Injured_Historical_Running$Date>=past_3,]$Injury.Count <- injury_count
        
    #four weeks before injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_3 & 
                                 Injured_Historical_Running$Date>=past_4,]$Weeks.After.Injury <- "-4"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date < past_3 & 
                                 Injured_Historical_Running$Date>=past_4,]$Injury.Count <- injury_count
    
    #Date of Injury
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date == injury_dates[j],]$Weeks.After.Injury <- "0"
    
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] &
                                 Injured_Historical_Running$Date == injury_dates[j],]$Injury.Count <- injury_count
    
    
    #adding injury count to indicent report
    Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Injury.Count <- injury_count
    
  }
}


#making weeks after injury and injury count into a factor and combining data sets back together
Injured_Historical_Running <- Injured_Historical_Running %>%
  mutate(Weeks.After.Injury = factor(Weeks.After.Injury),
         Injury.Count = factor(Injury.Count))

Historical_Running_clean <- left_join(Historical_Running_clean, Injured_Historical_Running)

#getting rid of junk that was from the loop
remove(future_1, future_2, future_3, future_4, i, injury_count, injury_dates, j, past_1, past_2, past_3, past_4)
remove(Injured_Historical_Running)
```

### Injury Risk
```{r}
for(i in 1:22){
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id==injured_IDs[i],], aes(Date, Running.Imbalance)) +
    geom_line(linetype=1) +
    geom_point(aes(color=Weeks.After.Injury)) +
    geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury, linetype=2) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue")) +
    theme_minimal() +
    labs(title="Running Imbalance", subtitle = injured_IDs[i])
  
  print(p)
}
```


```{r}
#making summary statistics of running imbalance per week relative to injury
Historical_Running_clean <- Historical_Running_clean %>%
  group_by(anon_id, Injury.Count, Weeks.After.Injury) %>%
  mutate(Weeks.After.Injury.Variability = var(Running.Imbalance),
         Weeks.After.Injury.Mean = mean(abs(Running.Imbalance))) %>%
  ungroup()
```


```{r warning=FALSE}
for(i in 1:22){
  #looking at mean running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Mean, group=Injury.Count)) +
  geom_line() +
  geom_point(aes(color=Weeks.After.Injury)) +
    xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
  geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
    labs(title="Average Absolute Distance per Week",injured_IDs[i]) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
  
  print(p)
}

```


```{r}
for(i in 1:22){
  #looking at variance in running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Weeks.After.Injury.Variability, group=Injury.Count)) +
  geom_line() +
  geom_point(aes(color=Weeks.After.Injury)) +
    xlim(min(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)-30, max(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)+30) +
  geom_vline(xintercept = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury) +
    labs(title="Variance in Running Imbalance per Week", subtitle=injured_IDs[i]) +
    scale_color_manual(values = c("-4"="green", "-3"="yellow", "-2"="orange", "-1"="red", "0"="black","1"= "purple", "2"="navy", "3"="blue", "4"="skyblue"))
  
  print(p)
}
```

### Player Trends
```{r Looking at detectable trends in player data}
for(i in 1:71){
  #only plotting if there are over 15 data points for each player
  if(nrow(Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],])>15){
    #calculating how strong of a non linear trend there is using a gam
    df <- summary(gam(Running.Imbalance~s(Days.Since.Start),
              data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]))$edf
    #calculating how strong of a linear trend there is using kendall correlation
    Kendall_Cor <- cor(x=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Days.Since.Start, y=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Running.Imbalance, method="kendall")
  
    #if there is a detectable linear relationship in the data
if(abs(Kendall_Cor>0.2) & df<=3){ #linear trend in data
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
    geom_line() +
    geom_point() +
    geom_smooth(color= "red",
                method="lm",
                se=FALSE) +
    labs(title="Running Imbalance Since January 1, 2024 (linear trend)", subtitle=all_IDs[i],) +
    xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
  
  print(p)
}
    #if there is a detectable non-linear trend in the data
  if(df > 3){ #non linear trend in data
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
    geom_line() +
    geom_point() +
    geom_smooth(color = "skyblue",
                se=FALSE) +
    labs(title="Running Imbalance Since January 1, 2024 (non-linear trend)", subtitle=all_IDs[i],) +
    xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
  
  print(p)
  }
    #if there is no detectable trend in the data
    if(df<=3 & abs(Kendall_Cor)<=0.2){
      p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],], aes(Date, Running.Imbalance)) +
    geom_line() +
    geom_point() +
    labs(title="Running Imbalance Since January 1, 2024 (non-linear trend)", subtitle=all_IDs[i],) +
    xlim(as.Date("2025-01-01", "%Y-%m-%d"), as.Date("2025-05-01", "%Y-%m-%d"))
    }
  }
}

#plotted only from January 1, 2025 to May 1, 2025
```


### Detectable Trend with Injury Risk
```{r}
trends <- data.frame(ID = all_IDs,
                     KC = rep(0, 71))
for(i in 1:71){
  #only plotting if there are over 15 data points for each player
  if(nrow(Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],])>15){
    #calculating how strong of a non linear trend there is using a gam
    df <- summary(gam(Running.Imbalance~s(Days.Since.Start),
              data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]))$edf
    #calculating how strong of a linear trend there is using kendall correlation
    Kendall_Cor <- cor(x=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Days.Since.Start, y=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],]$Running.Imbalance, method="kendall")
    
    trends[trends$ID == all_IDs[i],2] = Kendall_Cor
  
    if(all_IDs[i] %in% injured_IDs){
      p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],],
             aes(Date, Running.Imbalance)) +
        geom_point(color="skyblue") +
        geom_line(color="skyblue") +
        labs(subtitle=Kendall_Cor)
    }
    if(!(all_IDs[i] %in% injured_IDs)){
      p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == all_IDs[i],],
             aes(Date, Running.Imbalance)) +
        geom_point() +
        geom_line() +
        labs(subtitle=Kendall_Cor)
    }
    
    print(p)
  }
}

#plotted only from January 1, 2025 to May 1, 2025
```


```{r}
trends <- trends %>%
  mutate(injured = ifelse(ID %in% injured_IDs,1,0))

ggplot(trends[trends$KC>0 & trends$injured==1,], aes(KC)) +
  geom_histogram()

ggplot(trends[trends$KC>0 & trends$injured==0,], aes(KC)) +
  geom_histogram()

```


```{r}
injured_trends <- trends %>%
  filter(injured==1)

uninjured_trends <- trends %>%
  filter(injured==0)


kendall_cors <- data.frame(injured_avg_KC = rep(NA,5000),
                           uninjured_avg_KC = rep(NA,5000),
                           avg_diff_KC = rep(NA,5000))

for(i in 1:5000){
  set.seed(i)
  injured_samp <- sample_n(injured_trends, size=22, replace=TRUE)
  uninjured_samp <- sample_n(uninjured_trends, size=49, replace=TRUE)
  
  kendall_cors[i,1] <- mean(abs(injured_samp$KC))
  kendall_cors[i,2] <- mean(abs(uninjured_samp$KC))
  kendall_cors[i,3] <- kendall_cors[i,1] - kendall_cors[i,2]
}

```


```{r}
ggplot(data=kendall_cors, aes(avg_diff_KC)) +
  geom_histogram() +
  geom_vline(xintercept = quantile(kendall_cors$avg_diff_KC, 0.05), color = "#CFB87C") +
  geom_vline(xintercept = quantile(kendall_cors$avg_diff_KC, 0.95), color = "#CFB87C") +
  labs(title = "Estimated Difference in Average Kendall Rank Correlation Coefficient",
       x = "Difference in Average Kendall Coefficient")

ggplot(data=kendall_cors) +
  geom_histogram(aes(injured_avg_KC), alpha=0.7, fill="#CFB87C") +
  geom_histogram(aes(uninjured_avg_KC), alpha=0.7) +
  labs(title = "Estimated Average Kendall Rank Correlation Coefficient",
       x = "Average Kendall Coefficient")
```


```{r removing unnecessary objects from section}
remove(i, p, df, Kendall_Cor, kendall_cors, trends)
```


## Is running imbalance sensitive enough of a metric to use as a prognosis tool versus a rehab tool?

  Based on the analysis below, we can see that by solely using variance in running imbalance from the time of the injury to the end of the predicted return-to-play range is not a very strong predictor for whether or not it will take longer for a player to recover or not. In this analysis, we used the running imbalance in the time frame starting with the injury date to the end of the prognosis time frame. With these specific running imbalance values, we calculated the variance in running imbalance and whether or not the athlete returned to play within the predicted time frame or not. 
  This analysis found that when using the variance in these time frames as the only predictor in a logistic regression model, the slope coefficient associated with the variance was statistically significant at the $\alpha = 0.01$ significance level. With that though, the cross validated accuracy of the model was only around 0.6 suggesting that it wasn't super strong in practice.
  In order to understand the impact that variance in running imbalance had on whether or not a player returned in the predicted time frame or not, we performed a bootstrap. We separated the observations in which players did and did not return in the predicted time frame into two different data sets. We then sampled from each of these two data sets and calculated the average variance for each sample. This was repeated 5000 times. This gave us an estimate of the average variance in running imbalance for players who returned within the predicted time frame and those who did not.
  This bootstrap revealed that players who did not return within the predicted time frame had a variance greater by roughly 1.2 during their time of recovery than those who returned on time. This tells us that while variance in running imbalance is not directly strong enough to predict whether or not an athlete will return within the predicted time frame, it can be used to supplement prognosis or make adjustments to the prognosis during the time of recovery of a HSI. 

```{r}
#Calculating date back to play in incident report data set
Incident_Report_clean <- Incident_Report_clean %>%
  filter(!is.na(Injury.Prognosis))%>%
  #calculating how long predicted time loss is based on prognosis
         #beginning of predicted range of return
  mutate(Expected.Start.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
                                        Date.of.Injury,
                                        ifelse(Injury.Prognosis=="Less than 1 Week",
                                               Date.of.Injury,
                                               ifelse(Injury.Prognosis=="1-4 Weeks",
                                                      Date.of.Injury+days(7),
                                                      Date.of.Injury+days(28))))),
         #end of predicted range of return
         Expected.End.Return = as.Date(ifelse(Injury.Prognosis=="No Expected Time Loss",
                                        Date.of.Injury,
                                        ifelse(Injury.Prognosis=="Less than 1 Week",
                                               Date.of.Injury+days(7),
                                               ifelse(Injury.Prognosis=="1-4 Weeks",
                                                      Date.of.Injury+days(28),
                                                      Date.of.Injury+days(56)))))) %>%
  group_by(anon_id, Date.of.Injury) %>%
  #calculating actual date cleared to return
  mutate(Actual.Return = Date.of.Injury+days(sum(na.omit(Days.in.Status)))) %>%
  ungroup()
```


```{r Looking at if return to play was in predicted range for each player}
for(i in 1:22){
  #looking at mean running imbalance per week before and after injury for each injured player
  p <- ggplot(data=Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],], aes(Date, Running.Imbalance)) +
  geom_line() +
    #Marking when injury occurred with gold line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury, color="#CFB87C") +
    #Marking actual return date with solid brown line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Actual.Return, color="#8D7334", linetype=1) +
  #Marking beginning of predicted return to play range with grey dotted line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.Start.Return, color="#A2A4A3", linetype=3) +
    #Marking end of predicted return to play range with grey dotted line
  geom_vline(xintercept=Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.End.Return, color="#A2A4A3", linetype=3) +
    labs(title=injured_IDs[i]) +
    annotate("rect", 
             xmin = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.Start.Return,
             xmax = Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Expected.End.Return,
             ymin=min(Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],]$Running.Imbalance), ymax=max(Historical_Running_clean[Historical_Running_clean$anon_id == injured_IDs[i],]$Running.Imbalance), alpha=0.3) +
    theme_minimal()
  
  print(p)
}

```



```{r}
#looking at running imbalance of only injured players
Injured_Historical_Running <- Historical_Running_clean %>%
  filter(anon_id %in% injured_IDs)

#Making binary column if date was in time range of injury prognosis
Injured_Historical_Running$Date.in.Range <- rep(0, 1658)

#go through all of the injured players in the data set
for(i in 1:22){
  #get the dates each player was injured
  injury_dates <- unique(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i],]$Date.of.Injury)
  
  #go through dates of injury for each player
  for(j in 1:length(injury_dates)){
    if(injured_IDs[i] == "ID_50"){ #ID_50 does not have enough running imbalance data
      break
    }
    #get the expected date of return for that instance of injury
    expected_return <- as.Date(Incident_Report_clean[Incident_Report_clean$anon_id==injured_IDs[i] & Incident_Report_clean$Date.of.Injury==injury_dates[j],]$Expected.End.Return[1])
    
    #if the date in running imbalance is between day of injury and last day of prediction, set as 1
    Injured_Historical_Running[Injured_Historical_Running$anon_id==injured_IDs[i] & Injured_Historical_Running$Date >= injury_dates[j] & Injured_Historical_Running$Date <= expected_return,]$Date.in.Range <- 1
  }
}

#making a column for the variance in running imbalance for each injury instance range
Injured_Historical_Running <- Injured_Historical_Running %>%
  filter(Date.in.Range == 1) %>%
  group_by(anon_id, Injury.Count) %>%
  mutate(Injury.Variability = var(Running.Imbalance)) %>%
  ungroup()

#adding injury and running data sets together
Injured_Data <- left_join(Injured_Historical_Running, Incident_Report_clean, by=c("anon_id", "Injury.Count"), relationship = "many-to-many") %>%
  #making new column if player returned in predicted time frame
  mutate(Return.in.Range = ifelse(Actual.Return>=Expected.Start.Return & Actual.Return <= Expected.End.Return, 1, 0)) %>%
  #removing rows that are missing important data
  filter(!is.na(Injury.Variability),
         !is.na(Return.in.Range))
```


### Logistic Regression Model
```{r}
set.seed(1000)
#making a 75% to 25% training to testing split
rows <- sample(1:nrow(Injured_Data), size=(nrow(Injured_Data)*0.75), replace=FALSE)
Injured_train <- Injured_Data[rows,]
Injured_test <- Injured_Data[-rows,]

#building logistic regression model from training data
return_to_play_model <- glm(Return.in.Range~Injury.Variability, data=Injured_train, family="binomial")

#looking at coefficients and p-values
summary(return_to_play_model)

#making predictions on testing data based off of the model built
Injured_test <- Injured_test %>%
  mutate(Prediction = ifelse(predict(return_to_play_model, newdata=Injured_test, type="response")>0.5, 1, 0))

#calculating CER
Injured_test %>%
  summarize(CER = mean(Prediction != Return.in.Range))

```


```{r Cross validating logistic regression model}
#making model with all of the data
return_to_play_cv <- glm(Return.in.Range~Injury.Variability, data=Injured_Data, family="binomial")

#making cost function for CER
cost <- function(obs, pred){
  mean((pred <= 0.5) & obs==1 | (pred > 0.5) & obs==0)
}

set.seed(1000)

#cross validating with K=10
ten_cv <- cv.glm(data=Injured_Data,glmfit=return_to_play_cv,cost,K=10)

#extract average error
ten_cv$delta[1]
```


### Bootstrapping Differences Between In and Out of Range Return to Plays
```{r Making separate data sets for bootstrapped comparison}
#taking only players who recovered in predicted time frame
In_Range <- Injured_Data %>%
  filter(Return.in.Range == 1,
         !is.na(Injury.Variability))

#taking only players who did not recover in the predicted time frame
Out_Range <- Injured_Data %>%
  filter(Return.in.Range == 0,
         !is.na(Injury.Variability))
```


```{r Boostrapping variances across both groups}
#making data frame to hold values
Range_Variances <- data.frame(in.avg.var = rep(NA, 5000),
                              out.avg.var = rep(NA, 5000),
                              diff.avg.var = rep(NA, 5000))

#bootstrapping 5000 times
for(i in 1:5000){
  set.seed(i)
  
  #sample from players who recovered in predicted range
  in_sample <- sample_n(In_Range, size=308, replace=TRUE)
  #sample fro players who did not recover in predicted range
  out_sample <- sample_n(Out_Range, size=477, replace=TRUE)
  
  
  #calculating variances of each sample and storing in data frame
  Range_Variances[i,1] <- mean(in_sample$Injury.Variability)
  Range_Variances[i,2] <- mean(out_sample$Injury.Variability)
  Range_Variances[i,3] <- Range_Variances[i,2] - Range_Variances[i,1] #diff in variances
}
```


```{r Plotting results from bootstrap}
#plotting differences in average variance
ggplot(data=Range_Variances, aes(diff.avg.var)) +
  geom_histogram() +
  #adding in 90% CI (does not include 0)
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.05), color ="#CFB87C") +
  geom_vline(xintercept = quantile(Range_Variances$diff.avg.var, 0.95), color ="#CFB87C") +
  labs(title = "Estimated Difference in Average Variance of Running Imbalance",
       x= "Difference in Average Variance Running Imbalance") +
  theme_minimal()

#plotting average variances of difference groups
ggplot(data=Range_Variances) +
  #players who recovered in predicted time frame
  geom_histogram(aes(in.avg.var), alpha = 0.75) +
  #players who did not recover in the predicted time frame
  geom_histogram(aes(out.avg.var), alpha = 0.75, fill ="#CFB87C") +
  labs(title = "Estimated Average Variance in Running Imbalance", 
       x="Average Variance in Running Imbalance") +
  theme_minimal()
```


```{r Removing unnecessary objects created during this analysis}
remove(p,i,j,rows,ten_cv, cost, Injured_Data, Injured_Historical_Running, Injured_test, Injured_train, Range_Variances, return_to_play_cv, return_to_play_model, expected_return, injury_dates, In_Range, Out_Range, in_sample, out_sample)
```